Main code¶

In [ ]:
import math
import os
import random
import sys

import numpy as np
import scipy.io
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from IPython.display import clear_output

sklearn.linear_model.LinearRegression

sklearn.linear_model.Ridge

sklearn.linear_model.ElasticNet

sklearn.linear_model.LogisticRegression

Read and preprocess mat files¶

After reading and preprocessing the data in the next code chunk, the data from all cells is stored in a population dictionary. The dictionary's keys correspond to the cell names, and the values contain the data for each respective cell.

The values in the population dictionary are cell dictionaries with keys being 'axons', 'green_dFFMeanValues',and 'red_dFFMeanValues':

  • The value of 'axons', e.g., cell_data_dict['CL090_230515']['axons'] is a 1 dimensional numpy array, of which the length is the number of groups and the elements are 1 dimensional numpy arrays consisting of components belonging to the group.
  • The value of 'green_dFFMeanValues' is a 2 dimensional 3 by 49 numpy array (each cell has 3 rounds, and each round has 8 directions * 2 time frequencies * 3 space frequencies = 48 settings plus an extra period, so in total there are 49 columns), of which the elements are still 2 dimensional numpy arrays with size being 10 by N (N is the number of components).
  • The value of 'red_dFFMeanValues' is similarly a 2 dimensional 3 by 49 numpy array, of which the elements are still 2 dimensional numpy arrays with size being 10 by 1 (only recording the data at the soma).
In [ ]:
root_path = "/content/drive/MyDrive/Fluorescence_Data/FluoData4Fitting_Average"

# Each subdirectory under root_path is treated as one cell's data folder,
# so the folder names double as cell identifiers.
cell_names = [entry for entry in os.listdir(root_path)
              if os.path.isdir(os.path.join(root_path, entry))]

# Pre-populate the dictionary with a placeholder value per cell.
default_value = 0
cell_data_dict = {cell: default_value for cell in cell_names}

file_suffixes = ['green_Axon.mat', 'green_dFFMeanValues.mat', 'red_dFFMeanValues.mat']

for cell in cell_names:
    print(cell)
    axon_file, green_file, red_file = (cell + suffix for suffix in file_suffixes)

    # Axon grouping: after squeezing, `axons` has one entry per group, each a
    # 1-D int array of the component ids belonging to that group.
    axons = scipy.io.loadmat(os.path.join(root_path, cell, axon_file))['Axons']
    axons = np.squeeze(axons, axis=0)
    for i in range(len(axons)):
        axons[i] = np.squeeze(axons[i].astype(int), axis=0)

    # Fluorescence traces: 3 rounds x 49 stimulus columns of nested arrays.
    dFFMeanValues_green = scipy.io.loadmat(os.path.join(root_path, cell, green_file))['dFFMeanValues']
    dFFMeanValues_red = scipy.io.loadmat(os.path.join(root_path, cell, red_file))['dFFMeanValues']

    cell_data_dict[cell] = {'axons': axons,
                            'green_dFFMeanValues': dFFMeanValues_green,
                            'red_dFFMeanValues': dFFMeanValues_red}

# Sanity check: show every cell's keys and the stored value types.
for key, value in cell_data_dict.items():
    print("-- * * * * * --")
    print(key, type(value))
    for inner_key, inner_value in value.items():
        print(inner_key, type(inner_value))
print("-- * * * * * --")
CL090_230515
CL090_230518
CL083_230413
CL075_230303
-- * * * * * --
CL090_230515 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL090_230518 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL083_230413 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL075_230303 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
In [ ]:
# print to see data type and size
sample = cell_data_dict['CL090_230515']
print(sample.keys())
print(type(sample['axons']))
print(sample['axons'].shape)
print(sample['axons'][1].shape)
print(sample['axons'][1].dtype)
# Probe both fluorescence channels the same way; CL083_230413 is inspected
# too because its red data turns out to have an unexpected shape.
for channel in ('green_dFFMeanValues', 'red_dFFMeanValues'):
    print("--------------------------------")
    print(type(sample[channel]))
    print(sample[channel].shape)
    print(type(sample[channel][1,1]))
    print(sample[channel][0,1].shape)
    print(cell_data_dict['CL083_230413'][channel][0,1].shape)
    print(cell_data_dict['CL083_230413'][channel][0,1].dtype)
dict_keys(['axons', 'green_dFFMeanValues', 'red_dFFMeanValues'])
<class 'numpy.ndarray'>
(25,)
(19,)
int64
--------------------------------
<class 'numpy.ndarray'>
(3, 49)
<class 'numpy.ndarray'>
(10, 281)
(10, 155)
float64
--------------------------------
<class 'numpy.ndarray'>
(3, 49)
<class 'numpy.ndarray'>
(10, 1)
(10, 2)
float64

Note:

  • Four cells: 'CL090_230515', 'CL090_230518', 'CL083_230413', 'CL075_230303'.

  • 'red_dFFMeanValues' and 'green_dFFMeanValues' have 49 columns, where the last column should be excluded. They are supposed to have 3 rows (3 rounds), but 'CL090_230518' only has 2 rows.

  • In 'CL083_230413', elements in 'red_dFFMeanValues' have 2 columns (10 × 2, should be 10 × 1), so 'CL083_230413' is not used.

Fit data to model¶

Functions¶

In [ ]:
def plot_comparison(y_test, y_pred, subtitle = ''):
    """Plot predictions against ground truth, both ordered by the ground truth.

    Parameters
    ----------
    y_test : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values aligned element-wise with ``y_test``.
    subtitle : str
        Extra text appended to the figure title and the saved PNG filename.

    Side effects: saves the figure as a PNG in the working directory and
    displays it inline.
    """
    order = np.argsort(y_test)
    plt.plot(y_pred[order], label='Sorted Predictions')
    plt.plot(y_test[order], label='Sorted Ground Truth')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title(f'Comparison of Sorted Predictions and Sorted Ground Truth \n ({subtitle})')
    plt.legend()
    # bbox_inches='tight' grows the bounding box so long titles and axis
    # labels are not clipped in the saved file; the default margins can cut
    # text off.
    plt.savefig(f'Comparison of Sorted Predictions and Sorted Ground Truth ({subtitle}).png', bbox_inches='tight')
    plt.show()

Reorganize the data (vstack)¶

In [ ]:
cell_data = cell_data_dict['CL090_230515']

data_green = cell_data['green_dFFMeanValues'][:,:-1] # exclude 49th column
data_red = cell_data['red_dFFMeanValues'][:,:-1] # exclude 49th column
data_axons = cell_data['axons']

# vstack green data: collect the per-condition (10 x N) arrays and stack once.
# (The previous version re-vstacked the accumulator inside the loop, which
# copies the whole array each iteration -- O(rows^2); one vstack is O(rows).)
green_parts = [value for _, value in np.ndenumerate(data_green)]
stacked_green = np.vstack([np.empty((0, data_green[0, 0].shape[1]))] + green_parts)
print(stacked_green.shape, 48*3*10)

# Sum the green components of each axon group into one column per group.
# NOTE: 'satcked' is a typo for 'stacked', kept because downstream cells
# reference this exact name.
group_num = data_axons.shape[0]
group_satcked_green = np.zeros((stacked_green.shape[0], group_num))
for i, cols in enumerate(data_axons):
    # component ids come from MATLAB and are 1-based, hence the -1
    group_satcked_green[:, i] = np.sum(stacked_green[:, cols-1], axis=1)
print(group_satcked_green.shape, data_axons.shape)

# vstack red data (soma traces), same layout as the green data
red_parts = [value for _, value in np.ndenumerate(data_red)]
stacked_red = np.vstack([np.empty((0, data_red[0, 0].shape[1]))] + red_parts)
print(stacked_red.shape, 48*3*10)

print(np.max(group_satcked_green), np.min(group_satcked_green))
print(np.max(stacked_red), np.min(stacked_red))
(1440, 281) 1440
(1440, 25) (25,)
(1440, 1) 1440
15.553237533160676 -5.301797778486449
0.6745208147710272 -0.24153973313562027

Linear regression¶

Divide train and val datasets¶

In [ ]:
# independent data: per-group summed green (axon) signals
x = group_satcked_green

# dependent data (labels/targets): somatic red signal
y = np.squeeze(stacked_red)

# Hold out 5% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
for name, split in (("x_train", x_train), ("y_train", y_train),
                    ("x_test", x_test), ("y_test", y_test)):
    print(name, "shape:", split.shape)
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)

Ordinary linear regression¶

Ordinary least squares Linear Regression.

Linear Regression fits a linear model with coefficients to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

Fit and predict¶
In [ ]:
# Ordinary least-squares fit: one coefficient per axon group plus an intercept.
model = linear_model.LinearRegression().fit(x_train, y_train)

print("Fitted Coefficients:", model.coef_)
print("Fitted Intercept:", model.intercept_)

# Predictions on the held-out split.
y_pred = model.predict(x_test)
Fitted Coefficients: [ 8.28301566e-03  7.56874223e-03  2.17645374e-04 -2.06269935e-03
  3.53903502e-03  5.16754189e-03  1.24691680e-03  2.32002780e-02
  2.40241470e-03 -6.16858257e-03 -1.20710729e-03  2.78029746e-02
  9.57603350e-03 -4.35972960e-03  3.04637298e-03  6.77257749e-03
  8.29508492e-04  3.04802829e-02 -6.67335217e-04  9.61650591e-03
 -2.47112388e-02 -1.14696165e-03  8.29246943e-02 -1.13145597e-02
 -3.19379843e-01]
Fitted Intercept: -0.003966473464240797
Evaluate¶
In [ ]:
# Held-out error and agreement metrics for the OLS fit.
print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_test, y_pred))
Mean squared error: 0.007927335708322457
Correlation coefficient: 0.6140976886111208
Coefficient of determination (R-squared score, R2 score): 0.3610848833977214
In [ ]:
# Visual check: sorted predictions vs. sorted ground truth on the test split
plot_comparison(y_test, y_pred, 'Ordinary Linear Regression, Test Set')
In [ ]:
# predict on train
# In-sample (training-set) metrics for the OLS fit.
y_pred_ = model.predict(x_train)

print("Mean squared error:", mean_squared_error(y_train, y_pred_))
print("Correlation coefficient:", np.corrcoef(y_pred_, y_train)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_train, y_pred_))
Mean squared error: 0.0061596970599993445
Correlation coefficient: 0.6571303587882936
Coefficient of determination (R-squared score, R2 score): 0.43182030844123154
In [ ]:
# Visual check: sorted predictions vs. sorted ground truth on the training split
plot_comparison(y_train, y_pred_, 'Ordinary Linear Regression, Train Set')
In [ ]:
# digitized results
# Discretize y into class_num equal-width bins spanning the observed range,
# then score the binned predictions against the binned targets.
print("---- ---- ----")
class_num = 16
eps = np.finfo(float).eps
# Widen the range by machine epsilon so min/max fall strictly inside the bins.
# NOTE(review): eps is absolute, so this nudge only has effect for values of
# magnitude around 1 or below -- confirm this is acceptable for y's scale.
intervals = np.linspace(np.min(y) - eps, np.max(y) + eps, num=class_num + 1)
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
# Report test-set metrics first, then train-set metrics (same as before).
for truth, pred in ((y_test_digital, y_pred_test_digital),
                    (y_train_digital, y_pred_train_digital)):
    print("Mean squared error:", mean_squared_error(truth, pred))
    print("Correlation coefficient:", np.corrcoef(pred, truth)[0, 1])
    print("Coefficient of determination (R-squared score, R2 score):", r2_score(truth, pred))
---- ---- ----
Mean squared error: 2.486111111111111
Correlation coefficient: 0.6292717941149766
Coefficient of determination (R-squared score, R2 score): 0.36944077498899164
Mean squared error: 2.0255847953216373
Correlation coefficient: 0.6340286499988343
Coefficient of determination (R-squared score, R2 score): 0.4011018872884826

Ridge linear regression¶

Linear least squares with l2 regularization.

Minimizes the objective function:

$$ ||y - Xw||^2_2 + \alpha ||w||^2_2 $$

This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm.

Fit and predict¶
In [ ]:
# Ridge (L2-regularized) least squares; alpha controls the shrinkage strength
# and can be tuned as needed.
ridge_model = linear_model.Ridge(alpha=1.0).fit(x_train, y_train)

print("Fitted Coefficients:", ridge_model.coef_)
print("Fitted Intercept:", ridge_model.intercept_)

# Predictions on the held-out split.
y_pred = ridge_model.predict(x_test)
Fitted Coefficients: [ 0.00849453  0.00740115  0.00016157 -0.0020944   0.0034469   0.00444959
  0.00196809  0.0228857   0.0024516  -0.00571464 -0.00091334  0.0277737
  0.00929144 -0.00475298  0.00350069  0.00617136 -0.00020173  0.02975458
 -0.00172438  0.01062452 -0.02926797 -0.00329156  0.01071774 -0.01549336
 -0.06836985]
Fitted Intercept: -0.003992042396819981
Evaluate¶
In [ ]:
# Held-out metrics for the ridge fit, plus the sorted-comparison plot.
print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_test, y_pred))

plot_comparison(y_test, y_pred, 'Ridge Linear Regression, Test Set')
Mean squared error: 0.008058648873285792
Correlation coefficient: 0.6047541953182045
Coefficient of determination (R-squared score, R2 score): 0.35050150845424777
In [ ]:
# predict on train
# In-sample (training-set) metrics for the ridge fit.
y_pred_ = ridge_model.predict(x_train)

print("Mean squared error:", mean_squared_error(y_train, y_pred_))
print("Correlation coefficient:", np.corrcoef(y_pred_, y_train)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_train, y_pred_))

plot_comparison(y_train, y_pred_, 'Ridge Linear Regression, Train Set')
Mean squared error: 0.00617267529320373
Correlation coefficient: 0.6562194390940245
Coefficient of determination (R-squared score, R2 score): 0.43062317675322503
In [ ]:
# digitized results
# Discretize y into class_num equal-width bins spanning the observed range,
# then score the binned predictions against the binned targets.
print("---- ---- ----")
class_num = 16
eps = np.finfo(float).eps
# Widen the range by machine epsilon so min/max fall strictly inside the bins.
intervals = np.linspace(np.min(y) - eps, np.max(y) + eps, num=class_num + 1)
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
# Report test-set metrics first, then train-set metrics (same as before).
for truth, pred in ((y_test_digital, y_pred_test_digital),
                    (y_train_digital, y_pred_train_digital)):
    print("Mean squared error:", mean_squared_error(truth, pred))
    print("Correlation coefficient:", np.corrcoef(pred, truth)[0, 1])
    print("Coefficient of determination (R-squared score, R2 score):", r2_score(truth, pred))
---- ---- ----
Mean squared error: 2.486111111111111
Correlation coefficient: 0.6290805964856068
Coefficient of determination (R-squared score, R2 score): 0.36944077498899164
Mean squared error: 2.038011695906433
Correlation coefficient: 0.6311655529544047
Coefficient of determination (R-squared score, R2 score): 0.3974276657381052

ElasticNet linear regression¶

Linear regression with combined L1 and L2 priors as regularizer.

Minimizes the objective function:

$$ 1 / (2 * n_{samples}) * ||y - Xw||^2_2 + \alpha * l1_{ratio} * ||w||_1 + 0.5 * \alpha * (1 - l1_{ratio}) * ||w||^2_2 $$

If you control the L1 and L2 penalties separately, note that this is equivalent to:

$$ a * ||w||_1 + 0.5 * b * ||w||_2^2 $$

where: $\alpha = a + b$ and $l1_{ratio} = a / (a + b)$.

Fit and predict¶
In [ ]:
# Create an ElasticNet object.
# a weights the L1 penalty and b the L2 penalty, so alpha = a + b and
# l1_ratio = a / (a + b). With b = 0, l1_ratio = 1 (pure L1 / lasso-like).
# NOTE(review): a + b must stay nonzero or l1_ratio divides by zero.
a = 0.004; b = 0.00
alpha = a + b; l1_ratio = a / (a + b)
elasticnet_model = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=100000)

# Fit on the training split.
elasticnet_model.fit(x_train, y_train)

print("Fitted Coefficients:", elasticnet_model.coef_)
print("Fitted Intercept:", elasticnet_model.intercept_)

# Predict on the held-out split.
y_pred = elasticnet_model.predict(x_test)

# A convergence warning is expected here: the data is neither very linear nor
# normalized, so the coordinate-descent solver may not reach its tolerance
# even with a large max_iter. Results stay close to the ordinary/ridge fits;
# alpha = 0 would reproduce the ordinary least-squares solution.
Fitted Coefficients: [ 0.00951724  0.0068485   0.         -0.          0.00285676  0.
  0.          0.01545406  0.00635406 -0.          0.          0.02601547
  0.00529342 -0.          0.          0.00326147  0.          0.
  0.          0.         -0.          0.          0.         -0.
 -0.        ]
Fitted Intercept: -0.004587622786060119
Evaluate¶
In [ ]:
# Held-out metrics for the ElasticNet fit, plus the sorted-comparison plot.
print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_test, y_pred))

plot_comparison(y_test, y_pred, 'Elasticnet Linear Regression, Test Set')
Mean squared error: 0.008246618904776503
Correlation coefficient: 0.593481482809575
Coefficient of determination (R-squared score, R2 score): 0.33535179119658987
In [ ]:
# predict on train
# In-sample (training-set) metrics for the ElasticNet fit.
y_pred_ = elasticnet_model.predict(x_train)

print("Mean squared error:", mean_squared_error(y_train, y_pred_))
print("Correlation coefficient:", np.corrcoef(y_pred_, y_train)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_train, y_pred_))

plot_comparison(y_train, y_pred_, 'Elasticnet Linear Regression, Train Set')
Mean squared error: 0.006385274236585692
Correlation coefficient: 0.6428087118878939
Coefficient of determination (R-squared score, R2 score): 0.4110127314829676
In [ ]:
# digitized results
# Discretize y into class_num equal-width bins spanning the observed range,
# then score the binned predictions against the binned targets.
print("---- ---- ----")
class_num = 16
eps = np.finfo(float).eps
# Widen the range by machine epsilon so min/max fall strictly inside the bins.
intervals = np.linspace(np.min(y) - eps, np.max(y) + eps, num=class_num + 1)
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
# Report test-set metrics first, then train-set metrics (same as before).
for truth, pred in ((y_test_digital, y_pred_test_digital),
                    (y_train_digital, y_pred_train_digital)):
    print("Mean squared error:", mean_squared_error(truth, pred))
    print("Correlation coefficient:", np.corrcoef(pred, truth)[0, 1])
    print("Coefficient of determination (R-squared score, R2 score):", r2_score(truth, pred))
---- ---- ----
Mean squared error: 2.7777777777777777
Correlation coefficient: 0.566099913342907
Coefficient of determination (R-squared score, R2 score): 0.2954645530603258
Mean squared error: 2.1052631578947367
Correlation coefficient: 0.6144590454550366
Coefficient of determination (R-squared score, R2 score): 0.3775436432301804

Power-law regression¶

Mathematically, a power-law relationship can be expressed as:

$$ y = A X^C $$

Here, I modify it, shown as:

$$ y = A (X+B)^C + D $$

where $X = \beta_1 x_1 + \beta_2 x_2 + \dots + \beta_N x_N$ is a linear combination of the inputs (so with $C = 1$ the model $A(X+B)+D$ reduces to an ordinary linear regression). $A$, $B$, $C$, $D$, $\beta_1$, $\beta_2$, ..., $\beta_N$ are parameters to be determined.

Divide train and val datasets¶

In [ ]:
# independent data: per-group summed green (axon) signals
x = group_satcked_green

# dependent data (labels/targets): somatic red signal
y = np.squeeze(stacked_red)

# Hold out 5% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
for name, split in (("x_train", x_train), ("y_train", y_train),
                    ("x_test", x_test), ("y_test", y_test)):
    print(name, "shape:", split.shape)
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)

Fit, predict and evaluate (Example 1)¶

In [ ]:
exponent = 5

def func(X, *params):
    """Power-law model y = A * (X . beta + B)**exponent + D for curve_fit.

    params[:3] are (A, B, D); params[3:] are the per-feature weights beta.
    X is a 2-D array (samples x features); returns one value per sample.
    """
    A, B, D = params[:3]
    beta = np.array(params[3:])
    linear_term = np.sum(X * beta, axis=1) + B
    return A * np.power(linear_term, exponent) + D

# Create a LinearRegression object
model = linear_model.LinearRegression()

# give the initial params using linear regression
# so that the params are within a reasonable range
model.fit(x_train, y_train)
# print("Fitted Coefficients:", model.coef_)
# print("Fitted Intercept:", model.intercept_)
num_features = x_train.shape[1]
# model.coef_ is the np array of fitted OLS coefficients
model_coefs = model.coef_
# Initial guesses: A=1, B=OLS intercept, D=0, betas=OLS coefficients
initial_params = [1, model.intercept_, 0] + list(model_coefs) # Initial parameter guesses
# model_coefs_clipped = np.clip(model_coefs, 0, np.inf)
# initial_params = [1, model.intercept_, 0] + list(model_coefs_clipped) # Initial parameter guesses, no need to clip -- they can be negative

# Set lower and upper bounds for the parameters
# (currently unused: the bounded curve_fit call below is commented out;
# kept so the bounded variant can be re-enabled easily)
lower_bounds = [0, -np.inf, -np.inf] + [0] * num_features
upper_bounds = [np.inf, np.inf, np.inf] + [np.inf] * num_features

# Combine the lower and upper bounds into a 2-tuple of array_like
bounds = (lower_bounds, upper_bounds)

# Perform the curve fit
# NOTE(review): this call does NOT pass bounds= (the bounded variant is the
# commented line below); maxfev is large because convergence can be slow.
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)
# params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, bounds=bounds, maxfev=1000000)

# Print the fitted parameters
print("Fitted Parameters:", params)

# predict on test: y = A * sign(u) * |u|**exponent + D, with u = x . beta + B.
# (The sign/|.| split keeps the odd power well-defined for negative bases.)
A, B, D = params[:3]
beta = np.array(params[3:])
base = np.sum(x_test * beta, axis=1) + B
y_pred = A * np.sign(base) * np.abs(base) ** exponent + D

print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_test, y_pred))

plot_comparison(y_test, y_pred, 'Power-Law Regression (Exponent=5), Test Set')


# predict on train: same signed-power transform, evaluated in-sample.
A, B, D = params[:3]
beta = np.array(params[3:])
base = np.sum(x_train * beta, axis=1) + B
y_pred_ = A * np.sign(base) * np.abs(base) ** exponent + D

print("Mean squared error:", mean_squared_error(y_train, y_pred_))
print("Correlation coefficient:", np.corrcoef(y_pred_, y_train)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_train, y_pred_))

plot_comparison(y_train, y_pred_, 'Power-Law Regression (Exponent=5), Train Set')

# digitized results
# Discretize y into class_num equal-width bins spanning the observed range,
# then score the binned predictions against the binned targets.
print("---- ---- ----")
class_num = 16
eps = np.finfo(float).eps
# Widen the range by machine epsilon so min/max fall strictly inside the bins.
intervals = np.linspace(np.min(y) - eps, np.max(y) + eps, num=class_num + 1)
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
# Report test-set metrics first, then train-set metrics (same as before).
for truth, pred in ((y_test_digital, y_pred_test_digital),
                    (y_train_digital, y_pred_train_digital)):
    print("Mean squared error:", mean_squared_error(truth, pred))
    print("Correlation coefficient:", np.corrcoef(pred, truth)[0, 1])
    print("Coefficient of determination (R-squared score, R2 score):", r2_score(truth, pred))
Fitted Parameters: [ 3.63589934e+00  5.17800351e-01 -1.38641935e-01  4.83492282e-03
  4.37660663e-03 -3.27563273e-04 -1.65395632e-05  2.76270197e-03
  3.60564181e-03  1.18571822e-03  1.39874639e-02  9.74478013e-04
 -4.33905140e-03  3.12368809e-04  1.63689641e-02  5.70510078e-03
 -3.84107281e-03  9.86178700e-04  3.44528993e-03  1.88010948e-04
  1.79227711e-02 -6.87188789e-03  6.36542080e-03 -1.91319680e-02
 -5.91287026e-04  3.77753461e-02 -1.06618770e-02 -1.74782336e-01]
Mean squared error: 0.007801938801087381
Correlation coefficient: 0.6219617870348478
Coefficient of determination (R-squared score, R2 score): 0.3711914289705994
Mean squared error: 0.005965096684632098
Correlation coefficient: 0.6706493415945699
Coefficient of determination (R-squared score, R2 score): 0.44977053881398854
---- ---- ----
Mean squared error: 2.611111111111111
Correlation coefficient: 0.6064419746929176
Coefficient of determination (R-squared score, R2 score): 0.3377366798767063
Mean squared error: 1.939327485380117
Correlation coefficient: 0.653816054551323
Coefficient of determination (R-squared score, R2 score): 0.42660530746169045

Fit, predict and evaluate (Example 2)¶

In [ ]:
# compared with example 1, here only fit A and D.

exponent = 5

# OLS supplies the fixed projection weights (coef_) used by the power-law
# model defined in the next cell.
model = linear_model.LinearRegression().fit(x_train, y_train)
print("Fitted Coefficients:", model.coef_)
print("Fitted Intercept:", model.intercept_)

# Define the model function
def func(X, A, D):
    """Power-law with frozen weights: y = A * (X . model.coef_)**exponent + D.

    Reads the globals `model` (fitted OLS) and `exponent`; only A and D are
    free parameters for curve_fit. Note the OLS intercept is NOT included.
    """
    projected = np.sum(X * np.array(model.coef_), axis=1)
    return A * np.power(projected, exponent) + D

initial_params = [1, 0]  # starting guesses for A and D

# Fit only A and D; the projection weights stay fixed at the OLS solution.
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)
print("Fitted Parameters:", params)


# predict on test
A, D = params
y_pred = func(x_test, A, D)

print("Mean squared error:", mean_squared_error(y_test, y_pred))
print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_test, y_pred))

plot_comparison(y_test, y_pred, 'Power-Law Regression Exponent=5, only fit A and D, Test Set')

# predict on train (same fitted A and D)
y_pred_ = func(x_train, A, D)

print("Mean squared error:", mean_squared_error(y_train, y_pred_))
print("Correlation coefficient:", np.corrcoef(y_pred_, y_train)[0, 1])
print("Coefficient of determination (R-squared score, R2 score):", r2_score(y_train, y_pred_))

plot_comparison(y_train, y_pred_, 'Power-Law Regression Exponent=5, only fit A and D, Train Set')
Fitted Coefficients: [ 8.28301566e-03  7.56874223e-03  2.17645374e-04 -2.06269935e-03
  3.53903502e-03  5.16754189e-03  1.24691680e-03  2.32002780e-02
  2.40241470e-03 -6.16858257e-03 -1.20710729e-03  2.78029746e-02
  9.57603350e-03 -4.35972960e-03  3.04637298e-03  6.77257749e-03
  8.29508492e-04  3.04802829e-02 -6.67335217e-04  9.61650591e-03
 -2.47112388e-02 -1.14696165e-03  8.29246943e-02 -1.13145597e-02
 -3.19379843e-01]
Fitted Intercept: -0.003966473464240797
Fitted Parameters: [3.06877195e+02 3.73110121e-02]
Mean squared error: 0.010403693780630378
Correlation coefficient: 0.4740522878239656
Coefficient of determination (R-squared score, R2 score): 0.16149921366803333
Mean squared error: 0.00901247587500699
Correlation coefficient: 0.41070148964947206
Coefficient of determination (R-squared score, R2 score): 0.16867571360029543

Fit and predict¶

In [ ]:
# generate irreducible fraction with an odd number as the denominator
# such numbers can work as the exponent for negative numbers and
# will be used as the parameter "C" in Power-law regression below:
# y = A * (B+b1*x1+b2*x2+...+bN*xN)**C + D

def gcd(a, b):
    """Return the greatest common divisor of a and b (Euclid's algorithm)."""
    return gcd(b, a % b) if b else a

def generate_irreducible_fraction(existing_fractions=()):
    """Draw a random irreducible fraction (numerator, denominator) that is not
    already in `existing_fractions`.

    The denominator is always odd, so the fraction is a valid exponent for
    negative bases (used as parameter C in the power-law model
    y = A * (B + b1*x1 + ... + bN*xN)**C + D).

    Args:
        existing_fractions: collection of (numerator, denominator) tuples to
            reject as duplicates.  The default is an immutable tuple instead of
            a mutable list (mutable-default-argument pitfall); the argument is
            only read, never mutated.

    Returns:
        (numerator, denominator) tuple with gcd(numerator, denominator) == 1.
    """
    while True:
        numerator = random.randint(1, 400)         # random numerator in [1, 400]
        denominator = random.randrange(1, 100, 2)  # random ODD denominator in [1, 99]
        if gcd(numerator, denominator) == 1:       # keep only irreducible fractions
            fraction = (numerator, denominator)
            if fraction not in existing_fractions:  # reject duplicates
                return fraction

# Generate N_fraction distinct irreducible fractions (candidate exponents)
# whose value is below upper_bound.
N_fraction = 40     # how many fractions to generate
upper_bound = 50    # keep only fractions with value < upper_bound
irreducible_fractions = []
while len(irreducible_fractions) < N_fraction:
    fraction = generate_irreducible_fraction(irreducible_fractions)
    if fraction[0] / fraction[1] < upper_bound:
        irreducible_fractions.append(fraction)

# Sort the fractions in ascending order of their numeric value.
irreducible_fractions.sort(key=lambda f: f[0] / f[1])

# Print a ~10% random sample of the generated fractions (just a peek).
for numerator, denominator in irreducible_fractions:
    if random.random() <= 0.1:
        print(f"{numerator}/{denominator}")


# Plot the fraction values in ascending order.
values = [numerator / denominator for numerator, denominator in irreducible_fractions]
plt.plot(values, 'o-')
plt.xlabel("Index")
plt.ylabel("Irreducible Fraction")
plt.title("Irreducible Fractions")
plt.show()

# NOTE: the randomly generated list above is discarded here and replaced by a
# fixed, hand-picked list; comment the next line out to use the random one.
irreducible_fractions = [(1,95), (30,43), (179,65), (5,1), (221,33), (219,23), (300,17), (73,3)]
3/11
60/83
382/19
In [ ]:
# # old code! The new one is in the next chunk.

# params_list = []
# for numerator, denominator in irreducible_fractions:
#     C1, C2 = numerator, denominator

#     # Define the model function
#     def func(X, *params):
#         A, B, D = params[:3]
#         # Compute the weighted sum
#         weighted_sum = np.sum(X * np.array(params[3:]), axis=1)
#         base = weighted_sum + B
#         abs_base = np.abs(base)
#         sign = np.sign(base)
#         power_result = np.power(abs_base, C1 / C2)
#         return A * sign * power_result + D

#     # Create a LinearRegression object
#     model = linear_model.LinearRegression()

#     # give the initial params using linear regression
#     # so that the params are within a reasonable range
#     model.fit(x_train, y_train)
#     # print("Fitted Coefficients:", model.coef_)
#     # print("Fitted Intercept:", model.intercept_)
#     num_features = x_train.shape[1]
#     initial_params = [1, model.intercept_, 0] + list(model.coef_) # Initial parameter guesses

#     # Perform the curve fit
#     params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)
#     params_list.append(params)

#     # Print the fitted parameters
#     # print("Fitted Parameters:", params)

# # predict on test
# y_predict_test_list = []
# for (numerator, denominator), params in zip(irreducible_fractions, params_list):
#     C1, C2 = numerator, denominator
#     A, B, D = params[:3]
#     weighted_sum = np.sum(x_test * np.array(params[3:]), axis=1)
#     base = weighted_sum + B
#     abs_base = np.abs(base)
#     sign = np.sign(base)
#     power_result = np.power(abs_base, C1 / C2)
#     y_pred = A * sign * power_result + D
#     y_predict_test_list.append(y_pred)

# # predict on train
# y_predict_train_list = []
# for (numerator, denominator), params in zip(irreducible_fractions, params_list):
#     C1, C2 = numerator, denominator
#     A, B, D = params[:3]
#     weighted_sum = np.sum(x_train * np.array(params[3:]), axis=1)
#     base = weighted_sum + B
#     abs_base = np.abs(base)
#     sign = np.sign(base)
#     power_result = np.power(abs_base, C1 / C2)
#     y_pred_ = A * sign * power_result + D
#     y_predict_train_list.append(y_pred_)
In [ ]:
# New code: an updated version of the code in the previous chunk.
# Input and output data normalization is implemented here (although
# the normalization appears to be redundant).

class CurveFit_with_Normalization:
    """Power-law curve fit on min-max normalized data.

    Model (in normalized space):
        y = A * sign(base) * |base|**(p/q) + D,   base = B + w1*x1 + ... + wN*xN
    where p/q = exponent_numerator / exponent_denominator.  The sign/abs split
    makes the fractional power well defined for negative bases.

    Inputs are linearly rescaled to `input_range` and targets to
    `output_range` before fitting; `predict` maps results back to the
    original output scale.
    """

    def __init__(self, exponent_numerator=1, exponent_denominator=1, input_range=(1, 2), output_range=(1, 2)):
        # Target ranges for min-max normalization (immutable defaults to avoid
        # the mutable-default-argument pitfall; only indexed, never mutated).
        self.input_min = input_range[0]
        self.input_max = input_range[1]
        self.output_min = output_range[0]
        self.output_max = output_range[1]
        # Scale/shift parameters, computed from the training data in fit().
        self.input_scale = None
        self.input_shift = None
        self.output_scale = None
        self.output_shift = None
        self.exponent_numerator = exponent_numerator
        self.exponent_denominator = exponent_denominator
        # Linear regression used only to produce initial parameter guesses.
        self.linear_model = linear_model.LinearRegression()
        # Fitted power-law parameters (normalized space); set by fit().
        self.normalized_fitted_params = None

    def fit(self, X, y):
        """Fit the power-law model to (X, y).  Returns self for chaining."""
        # Min-max normalize the inputs (global min/max over the whole matrix).
        self.input_scale = (self.input_max - self.input_min) / (np.max(X) - np.min(X))
        self.input_shift = self.input_min - np.min(X) * self.input_scale
        normalized_X = self.input_scale * X + self.input_shift

        # Min-max normalize the targets the same way.
        self.output_scale = (self.output_max - self.output_min) / (np.max(y) - np.min(y))
        self.output_shift = self.output_min - np.min(y) * self.output_scale
        normalized_y = self.output_scale * y + self.output_shift

        # Delegate to the shared static model function so fit() and predict()
        # use a single definition of the formula (previously duplicated here).
        def model_func(X_normalized, *params):
            return self.normalized_func(X_normalized, self.exponent_numerator,
                                        self.exponent_denominator, *params)

        # Seed curve_fit with a linear-regression solution so the initial
        # parameters start in a reasonable range.
        self.linear_model.fit(normalized_X, normalized_y)
        exponent = self.exponent_numerator / self.exponent_denominator
        # Heuristic mapping of the linear solution (intercept, coefs) into the
        # power-law parameterization [A, B, D, w1..wN].
        initial_params = [1, self.linear_model.intercept_ / exponent + (1 - 1 / exponent), 0] + list(self.linear_model.coef_ /
                        exponent)

        # Perform the curve fit in normalized space (covariance is unused).
        normalized_params, _params_covariance = curve_fit(model_func, normalized_X, normalized_y,
                                  p0=initial_params, maxfev=100000000)

        self.normalized_fitted_params = normalized_params
        return self

    def predict(self, X):
        """Predict targets for X on the original (un-normalized) output scale.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.normalized_fitted_params is None:
            raise RuntimeError("predict() called before fit()")

        # Normalize with the scale/shift learned during fit().
        normalized_X = self.input_scale * X + self.input_shift

        y_pred = self.normalized_func(normalized_X, self.exponent_numerator,
                              self.exponent_denominator, *self.normalized_fitted_params)

        # Map predictions back to the original output scale.
        return (y_pred - self.output_shift) / self.output_scale

    @staticmethod
    def normalized_func(X_normalize, exponent_numerator, exponent_denominator, *params):
        """Power-law model in normalized space.

        params = (A, B, D, w1, ..., wN); returns
        A * sign(base) * |base|**(p/q) + D with base = B + sum_i w_i * x_i.
        """
        A, B, D = params[:3]
        # Weighted sum of feature columns for each row of X.
        weighted_sum = np.sum(X_normalize * np.array(params[3:]), axis=1)
        base = weighted_sum + B
        abs_base = np.abs(base)
        # sign/abs split so the fractional power is defined for negative bases.
        sign = np.sign(base)
        power_result = np.power(abs_base, exponent_numerator / exponent_denominator)
        return A * sign * power_result + D

# Fit one power-law model per candidate exponent (numerator/denominator pair).
model_list = []
for exp_num, exp_den in irreducible_fractions:
    # Each model normalizes inputs and outputs to [1, 2] before curve fitting.
    fitted = CurveFit_with_Normalization(exp_num, exp_den, input_range=[1, 2], output_range=[1, 2])
    fitted.fit(x_train, y_train)
    model_list.append(fitted)
In [ ]:
# this is for the new version code in the last chunk

def _report_predictions(models, fractions, X, y_true):
    """Predict with each fitted model, print MSE / correlation / R2, and
    return (r2_list, prediction_list) in the same order as `models`.

    Extracted because the test-set and train-set loops were copy-pasted
    duplicates of each other.
    """
    r2_list, pred_list = [], []
    for model, (numerator, denominator) in zip(models, fractions):
        y_hat = model.predict(X)

        print(f'-------- \n (numerator, denominator) is: ({numerator}, {denominator})')

        print("Mean squared error:", mean_squared_error(y_true, y_hat))
        print("Correlation coefficient:", np.corrcoef(y_hat, y_true)[0, 1])

        r_squared = r2_score(y_true, y_hat)
        print("Coefficient of determination (R-squared score, R2 score):", r_squared)

        r2_list.append(r_squared)
        pred_list.append(y_hat)
    return r2_list, pred_list

# predict on test
r2_score_test_list, y_predict_test_list = _report_predictions(model_list, irreducible_fractions, x_test, y_test)

print("|||||||||||||||||||||||||||||||||||||")

# predict on train
r2_score_train_list, y_predict_train_list = _report_predictions(model_list, irreducible_fractions, x_train, y_train)


# NOTE: this rebinds `x` (previously the full feature matrix) to the list of
# exponent values; a later cell reassigns `x` before it is used as data again.
x = [numerator / denominator for (numerator, denominator) in irreducible_fractions]

# plot the r2 score curve over exponent
fig, ax = plt.subplots(figsize=(7.5, 5))  # Adjust the values as desired

# Plot the R-squared scores
ax.plot(x, r2_score_train_list, label='Train R-squared')
ax.plot(x, r2_score_test_list, label='Test R-squared')

# Set labels and title with font size
ax.set_xlabel('Exponent in Power Law', fontsize=14)
ax.set_ylabel('R-squared', fontsize=14)
ax.set_title('R-squared Scores', fontsize=16)

# Set tick label font size
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Set legend
ax.legend(fontsize=12)

fig.savefig('Power_Law_r2_scores_plot.png')

# Display the plot
plt.show()
-------- 
 (numerator, denominator) is: (1, 95)
Mean squared error: 0.012553698090272793
Correlation coefficient: 0.5492586362659945
Coefficient of determination (R-squared score, R2 score): -0.01178350132389605
-------- 
 (numerator, denominator) is: (30, 43)
Mean squared error: 0.007927645253546677
Correlation coefficient: 0.6140764914279518
Coefficient of determination (R-squared score, R2 score): 0.36105993515151313
-------- 
 (numerator, denominator) is: (179, 65)
Mean squared error: 0.0077922987749738555
Correlation coefficient: 0.6224739461963064
Coefficient of determination (R-squared score, R2 score): 0.3719683808539297
-------- 
 (numerator, denominator) is: (5, 1)
Mean squared error: 0.007801721831719279
Correlation coefficient: 0.6219708360133526
Coefficient of determination (R-squared score, R2 score): 0.37120891593144234
-------- 
 (numerator, denominator) is: (221, 33)
Mean squared error: 0.007804194315316004
Correlation coefficient: 0.6218374974348415
Coefficient of determination (R-squared score, R2 score): 0.37100964253069046
-------- 
 (numerator, denominator) is: (219, 23)
Mean squared error: 0.00780588812404452
Correlation coefficient: 0.6217442394863175
Coefficient of determination (R-squared score, R2 score): 0.37087312756007484
-------- 
 (numerator, denominator) is: (300, 17)
Mean squared error: 0.007807965450587863
Correlation coefficient: 0.6216287490698504
Coefficient of determination (R-squared score, R2 score): 0.37070570241504563
-------- 
 (numerator, denominator) is: (73, 3)
Mean squared error: 0.007809033771437306
Correlation coefficient: 0.6215582319310625
Coefficient of determination (R-squared score, R2 score): 0.37061959954704493
|||||||||||||||||||||||||||||||||||||
-------- 
 (numerator, denominator) is: (1, 95)
Mean squared error: 0.01045316636333401
Correlation coefficient: 0.4156817350556336
Coefficient of determination (R-squared score, R2 score): 0.03578426304422133
-------- 
 (numerator, denominator) is: (30, 43)
Mean squared error: 0.006160312058280804
Correlation coefficient: 0.6570871953391387
Coefficient of determination (R-squared score, R2 score): 0.43176358007773163
-------- 
 (numerator, denominator) is: (179, 65)
Mean squared error: 0.0059664793888465975
Correlation coefficient: 0.6705542454661818
Coefficient of determination (R-squared score, R2 score): 0.4496429961042646
-------- 
 (numerator, denominator) is: (5, 1)
Mean squared error: 0.005965096687271862
Correlation coefficient: 0.6706493409961944
Coefficient of determination (R-squared score, R2 score): 0.44977053857049276
-------- 
 (numerator, denominator) is: (221, 33)
Mean squared error: 0.0059651110041034
Correlation coefficient: 0.6706483564323005
Coefficient of determination (R-squared score, R2 score): 0.44976921796448766
-------- 
 (numerator, denominator) is: (219, 23)
Mean squared error: 0.005965205069629524
Correlation coefficient: 0.6706418874831295
Coefficient of determination (R-squared score, R2 score): 0.44976054121933307
-------- 
 (numerator, denominator) is: (300, 17)
Mean squared error: 0.00596536777619391
Correlation coefficient: 0.6706306978729792
Coefficient of determination (R-squared score, R2 score): 0.449745532921901
-------- 
 (numerator, denominator) is: (73, 3)
Mean squared error: 0.005965431990408115
Correlation coefficient: 0.6706262817005308
Coefficient of determination (R-squared score, R2 score): 0.4497396097065115
In [ ]:
# for model in model_list:
#     print(model.exponent_numerator, model.exponent_denominator)

Evaluate¶

In [ ]:
# Both the old and new versions of the code in the previous subsection
# ("Fit and predict") share the evaluation code in this subsection.

# Create and update one comparison figure per exponent (test set).
for y_pred, (numerator, denominator) in zip(y_predict_test_list, irreducible_fractions):
    title = f'Power-Law Regression Exponent={numerator} over {denominator}, Test Set'
    # NOTE: the word "over" (not "/") is used because the title doubles as a
    # saved-figure file name, and "/" is not allowed in file names.
    plot_comparison(y_test, y_pred, title)
    print(y_pred[0])
    clear_output(wait=True)  # keep only the most recent figure visible
0.024177814296192267
In [ ]:
# Create and update one comparison figure per exponent (train set).
for y_pred_, (numerator, denominator) in zip(y_predict_train_list, irreducible_fractions):
    title = f'Power-Law Regression Exponent={numerator} over {denominator}, Train Set'
    plot_comparison(y_train, y_pred_, title)
    print(y_pred_[0])
    clear_output(wait=True)  # keep only the most recent figure visible
0.08987348912424292

Exponential regression¶

Let $B = (b_1, b_2, \ldots , b_N)$. $$ y = A \cdot e^{(b_1 \cdot x_1 + \ldots + b_N \cdot x_N)} + C $$

Divide train and val datasets¶

In [ ]:
# Independent variables (features): the group-stacked green-channel matrix.
x = group_satcked_green

# Dependent variable (targets): red-channel values, flattened to 1-D.
y = np.squeeze(stacked_red)

# Hold out 5% of samples as a test set; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)

Fit and predict¶

In [ ]:
# Model: y = A * exp(b1*x1 + ... + bN*xN) + C
def func(X, *params):
    """Exponential model of a weighted sum of the feature columns.

    params = (A, C, b1, ..., bN); returns A * exp(sum_i b_i * x_i) + C per row.
    """
    A, C = params[0], params[1]
    weights = np.array(params[2:])
    linear_term = np.sum(X * weights, axis=1)
    return A * np.exp(linear_term) + C

# give the initial params using linear regression
# so that the params are within a reasonable range
# (here the guesses are simply A = mean(y_train), C = 0, and all weights 0,
# i.e., a constant model at the mean of the training targets).
num_features = x_train.shape[1]
initial_params = [np.mean(y_train), 0] + [0] * num_features # Initial parameter guesses

# Perform the curve fit
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)

# Print the fitted parameters
print("Fitted Parameters:", params)

# predict on test
A, C = params[:2]
y_pred = A * np.exp(np.sum(x_test * np.array(params[2:]), axis=1)) + C

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

# predict on train
A, C = params[:2]
y_pred_ = A * np.exp(np.sum(x_train * np.array(params[2:]), axis=1)) + C

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

# NOTE(review): this appends the exponential model's train R2 to
# r2_score_train_list, which was built in the power-law section above —
# looks like a leftover from copy-paste; confirm it is intentional.
r2_score_train_list.append(r_squared)

# digitized results
# Bin the continuous targets/predictions into class_num equal-width bins and
# recompute the metrics on the resulting bin indices.
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
print(y_pred)
print(y_pred_test_digital)
# Binned metrics on the test set:
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
# Binned metrics on the train set:
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
Fitted Parameters: [ 1.87123029e-01 -1.90080285e-01  3.40033178e-02  3.05772821e-02
 -2.49194815e-03  4.85530544e-05  1.95783707e-02  2.48969634e-02
  8.90102191e-03  9.83075781e-02  6.65559330e-03 -3.03907795e-02
  1.30905974e-03  1.15904357e-01  4.00205951e-02 -2.72080362e-02
  6.22504795e-03  2.48130071e-02  3.73831850e-04  1.26144436e-01
 -4.97544159e-02  4.56011391e-02 -1.34034892e-01 -3.74403443e-03
  2.63216376e-01 -7.38379867e-02 -1.21397447e+00]
Mean squared error: 0.007809965919858963
Correlation coefficient: 0.6215183575736783
Coefficient of determination (R-squared score, R2 score): 0.3705444716933213
Mean squared error: 0.005965612679987845
Correlation coefficient: 0.6706138550826338
Coefficient of determination (R-squared score, R2 score): 0.44972294262878365
---- ---- ----
[ 0.02425583  0.14706718  0.12366429  0.0786443   0.04049227  0.23597745
 -0.01620631  0.13441449  0.07236053  0.21618206 -0.01070742  0.06076135
 -0.0189932   0.02788252 -0.0500517  -0.01343889 -0.05818453  0.11900008
  0.09522372  0.00959078  0.03787487  0.13463547  0.0192085   0.05924506
  0.01095437  0.06465591  0.08565256  0.02403056  0.0302071   0.10111491
 -0.03160033 -0.01100594  0.16379375 -0.00697092  0.05063293 -0.03982638
  0.29517585  0.00586459  0.05913923 -0.01925285 -0.03112963  0.05716643
  0.0669799   0.05498539  0.01261709 -0.01698296  0.16587962  0.20025347
  0.02612595  0.01466163  0.04731695  0.03795217  0.03130737  0.08110612
  0.01362295  0.09130886  0.04140823 -0.08651457  0.18025977 -0.03097426
 -0.01282914  0.02221508  0.02125931 -0.00187638  0.10262163 -0.01400319
 -0.01616673  0.11696926  0.04167538 -0.00686065  0.08243136  0.059111  ]
[4 6 6 5 4 8 3 6 5 7 4 5 3 4 3 3 3 6 5 4 4 6 4 5 4 5 5 4 4 5 3 4 7 4 5 3 9
 4 5 3 3 5 5 5 4 3 7 7 4 4 5 4 4 5 4 5 4 2 7 3 3 4 4 4 6 3 3 6 4 4 5 5]
Mean squared error: 2.6527777777777777
Correlation coefficient: 0.5985681611539189
Coefficient of determination (R-squared score, R2 score): 0.32716864817261115
Mean squared error: 1.9305555555555556
Correlation coefficient: 0.6558085630583136
Coefficient of determination (R-squared score, R2 score): 0.4291988756148981

Evaluate¶

In [ ]:
# Visualize predicted vs. actual values on the held-out test set.
plot_comparison(y_test, y_pred, 'Exponential Regression, Test Set')
In [ ]:
# Visualize predicted vs. actual values on the training set.
plot_comparison(y_train, y_pred_, 'Exponential Regression, Train Set')

Logistic regression¶

Divide train and val datasets¶

In [ ]:
# independent data
x = group_satcked_green

# Number of equal-width bins used to digitize the continuous response.
class_num = 480

# dependent data (labels/targets)
y = np.squeeze(stacked_red)
# print(np.max(y), np.min(y))

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# NOTE(review): eps is an absolute offset; if |y| values are large, subtracting
# eps may not change the float at all — confirm the boundary bins behave.
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
# print(intervals)
# Digitize the array to get the indices of the intervals
# (bin indices run 0..class_num-1 after subtracting 1).
y_train = np.digitize(y_train, intervals) - 1
y_test = np.digitize(y_test, intervals) - 1
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
unique_elements = np.unique(y_train)
print("Unique elements:", unique_elements)
print("Number of unique elements:", len(unique_elements))
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)
Unique elements: [  0   7  27  36  38  40  43  50  52  54  55  56  57  58  60  61  62  63
  65  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 210
 211 212 213 214 215 216 217 218 219 220 222 223 224 226 227 228 229 230
 231 232 233 234 235 237 238 240 242 243 244 245 246 247 250 251 253 254
 255 256 257 258 259 260 263 265 266 267 268 269 270 272 273 274 277 280
 283 285 286 287 288 290 291 293 294 300 310 311 315 322 323 326 327 329
 332 334 335 336 339 340 343 349 356 357 360 369 380 383 389 391 426 475
 479]
Number of unique elements: 253

Fit and predict¶

'multinomial' (the default option for multi-class) achieves better performance than 'ovr'.

In [ ]:
# fit
# Multinomial (softmax) logistic regression over the digitized response bins.
# NOTE(review): `multi_class='multinomial'` is the default for this solver in
# modern scikit-learn and the parameter is deprecated in recent versions —
# confirm against the installed sklearn version.
model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='multinomial')
# model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='ovr')

fit_result = model.fit(x_train, y_train)
# One intercept and one coefficient row per class present in y_train.
print(fit_result.intercept_.shape, fit_result.coef_.shape)
print("Coefficients:", model.coef_[0,:])
print("Intercept:", model.intercept_[0])
print('--- --- ---')

# predict
# Use the trained model to make predictions
y_pred = model.predict(x_test)
# Alternatively, you can get the predicted probabilities for each class
y_prob = model.predict_proba(x_test)

# Sanity check: each row of y_prob sums to 1 (a distribution over classes).
print('y_prob.shape:', y_prob.shape)
print(np.sum(y_prob, axis = 1))
print(y_prob[0,:])

# Print the predicted class labels
print('y_pred:', y_pred)
print('y_test:', y_test)
print('y_pred shape:', y_pred.shape, 'y_test shape:', y_test.shape)
# Print the predicted probabilities
# print(y_prob)
(253,) (253, 25)
Coefficients: [-0.56105226 -0.62859538 -0.46179994 -0.6925927  -0.30706826 -0.01190705
 -0.45737944 -0.17699687 -0.62488952  0.51399664 -0.34089166 -0.34399414
  0.27368915 -0.11936503 -0.22297636  0.02269972 -0.35553952 -0.40214045
 -0.00632174 -0.31621744 -0.016341    0.06506962 -0.07248956 -0.05043361
 -0.02148207]
Intercept: -0.8575836013108327
--- --- ---
y_prob.shape: (72, 253)
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[3.00513727e-06 1.78744497e-05 4.74271558e-06 2.26308841e-06
 4.53582848e-04 2.95953122e-04 3.82431244e-04 6.93949432e-04
 5.98212044e-04 4.66303629e-03 4.76670086e-06 1.50275557e-03
 9.30465729e-03 4.59748348e-05 1.23666672e-05 3.81004232e-06
 5.58735220e-07 4.89590297e-05 1.94336182e-03 2.18697241e-05
 1.18488676e-04 2.35866565e-04 8.48631165e-04 9.48281252e-05
 4.13887514e-04 4.22881901e-04 7.18182737e-05 8.54745095e-04
 4.27335681e-04 1.29495961e-03 2.94954570e-04 5.47908929e-03
 4.90149592e-04 8.71618141e-04 9.76115172e-05 4.23398791e-03
 2.57026839e-04 4.12272636e-04 5.72641661e-05 4.95932025e-04
 4.97701787e-03 6.89949821e-03 1.67311130e-04 8.00211126e-03
 1.29303602e-03 2.03940831e-03 2.90252042e-04 8.60263501e-04
 1.25532128e-02 1.96396600e-03 2.94995906e-03 2.29869096e-03
 2.52056220e-03 2.54046320e-03 3.61508754e-03 6.41644563e-04
 1.87148723e-03 4.89574558e-03 5.02775497e-03 3.27392327e-02
 2.38527541e-02 2.46410302e-03 3.10847814e-03 1.21985752e-03
 2.46154288e-02 1.01599575e-02 6.83543333e-03 2.66300071e-02
 7.98466521e-03 1.13087216e-03 1.29043331e-02 2.38046011e-02
 7.47311391e-04 1.78326785e-02 6.14558292e-02 4.54307674e-03
 6.00180238e-03 3.50709096e-03 1.88260156e-02 2.28465105e-02
 5.53181977e-03 1.14844449e-02 3.04061894e-03 5.59138033e-02
 2.47701263e-02 2.53208946e-02 4.48656516e-02 6.83942450e-03
 7.49985832e-03 1.06915049e-02 1.36389724e-02 4.66767982e-03
 8.07249715e-03 5.11675482e-03 2.06943731e-03 5.82958060e-03
 5.76255114e-03 4.58914148e-03 4.66574638e-04 2.48459793e-03
 3.03024389e-02 1.14737057e-02 1.48355678e-03 8.96805350e-03
 4.27718262e-03 2.55261125e-02 6.87853119e-03 6.73633047e-03
 2.78849055e-03 1.48657664e-03 9.20821197e-03 1.48883412e-03
 5.71632058e-03 2.69526177e-03 1.25210487e-02 1.70457261e-02
 5.40348402e-03 7.54221885e-03 3.17343122e-03 4.04628878e-03
 1.59134015e-03 3.38044325e-03 1.17740638e-02 2.06243969e-02
 7.68512392e-03 1.65720427e-03 1.67597704e-03 7.47441136e-04
 9.40485225e-03 8.09834810e-04 3.09079655e-04 4.60063811e-03
 3.60651458e-04 2.01370207e-03 9.16105334e-05 1.01061821e-03
 1.53768067e-02 1.78599908e-03 1.89423123e-03 3.66742299e-04
 1.80925433e-04 5.71428042e-04 4.47115486e-03 6.57157329e-03
 2.31219664e-03 1.35079367e-03 2.74038945e-04 3.83427767e-03
 3.90998872e-03 4.96950620e-04 1.15381962e-03 5.20356131e-05
 4.88319433e-05 5.54816585e-04 1.30305277e-03 1.43730614e-03
 3.72037874e-04 2.74543329e-04 2.15903026e-03 1.57887188e-03
 1.70177036e-03 1.72214331e-03 9.47859292e-05 1.02799712e-04
 7.27857561e-07 1.70807550e-04 1.70738087e-04 2.66587945e-04
 3.15831384e-04 7.13953839e-04 2.21621458e-04 4.10680136e-03
 1.05631355e-04 4.27200006e-05 2.97115519e-03 1.61543879e-05
 5.09689647e-03 1.05839674e-04 8.75555582e-06 2.42098071e-06
 8.42593081e-04 9.57099524e-04 3.75626604e-05 3.12464106e-04
 5.86684109e-06 2.38019419e-06 3.54418016e-04 1.41452320e-07
 1.89816390e-04 1.82709282e-03 1.59873894e-03 1.32139610e-06
 9.82859548e-05 4.05675495e-06 3.13274180e-06 4.61319716e-04
 1.41501606e-06 5.31509642e-05 3.93780310e-05 9.95147588e-04
 2.58805822e-03 6.14909947e-06 2.78893982e-04 2.96500232e-04
 1.62679259e-05 4.35621408e-06 5.95755878e-04 4.45307065e-04
 4.42199814e-04 1.65588637e-04 1.40511540e-04 1.25233039e-04
 9.10654471e-06 6.53158492e-08 1.26363834e-02 3.57053876e-06
 4.92587186e-07 6.58464290e-07 1.28392972e-05 3.16635650e-06
 2.87248256e-05 2.20759364e-04 1.59761185e-05 4.88102647e-05
 3.06700085e-05 2.31340711e-04 2.47036710e-04 1.75085773e-04
 3.08114917e-05 5.16169235e-05 1.76352269e-04 1.28049604e-05
 3.12686717e-06 4.18501672e-05 2.25692519e-06 7.24950771e-06
 2.75336483e-05 2.66506514e-05 1.60810997e-04 5.04695259e-05
 2.88458128e-05 1.87919281e-04 2.78863401e-05 8.48976500e-05
 2.16008788e-05 1.85055177e-06 8.44840014e-06 5.90515843e-04
 9.90753216e-05 2.30940303e-05 1.87307075e-04 6.89947046e-05
 3.83568617e-06]
y_pred: [122 168 182 158 129 293 132 244 223 291 122 161 110 108 132  84  78 138
 211 145 131 199 116 129 130 184 163 137 165 153 141 126 188 113 148  75
 327 166 131 117 108 134 120 124 132 141 184 211 196 127 130 138  97 144
 134 129 126 103 183 145 123 135 145 106 162 145 119 171 138 122 184 170]
y_test: [195 131 252 352 130 217 105 183 176 281 157 180  79 165 107  86 109 128
 254 299 149 219 127 161 115 160 139 143  97 185 152 134 189 128 113 101
 306 160 142 122 122 124 300 124 124 137 176 282 115 178  99 150 117 151
 160 222 189 108 179  76 189 112 120  93 150 136  85 221 170 118 160 163]
y_pred shape: (72,) y_test shape: (72,)

Evaluate¶

Evaluate (normal)¶

In the previous data division, I classified the data into class_num (e.g., class_num = 160) intervals (a histogram, via np.digitize). Here, evaluate the results with the same number of classes (e.g., class_num = 160).

In [ ]:
# Top-1 accuracy over the fine-grained digitized classes (expected to be low
# with class_num = 480 bins and only 72 test samples).
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:") # y_test doesn't include all classes, so confusion matrix is less than num_class by num_class
print(cm)
# Layout of the confusion matrix:
#   rows    = true labels (ground truth)
#   columns = predicted labels
# The diagonal entry (i, i) counts correct predictions for class i; an
# off-diagonal entry (i, j) counts samples of true class i predicted as
# class j (a false negative for class i and a false positive for class j).

# Treat the class indices as ordinal values so MSE / correlation / R2
# measure how close the predicted bin is to the true bin.
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)


# Sort y_pred and y_test based on y_test
plot_comparison(y_test, y_pred, 'Logistic Linear Regression, Test Set')
Accuracy: 0.013888888888888888
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Mean squared error: 2563.9444444444443
Correlation coefficient: 0.5661491222250555
Coefficient of determination (R-squared score, R2 score): 0.24678329168154234
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = model.predict(x_train)
# Alternatively, you can get the predicted probabilities for each class
# NOTE(review): y_prob_ is computed but never used below — confirm intent.
y_prob_ = model.predict_proba(x_train)

accuracy = accuracy_score(y_train, y_pred_)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_train, y_pred_)
print("Confusion Matrix:")
print(cm)
# Layout of the confusion matrix:
#   rows    = true labels (ground truth)
#   columns = predicted labels
# Diagonal (i, i) = correct predictions for class i; off-diagonal (i, j) =
# samples of true class i predicted as class j.

# Ordinal-style metrics on the class indices (how close the predicted bin is).
mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Logistic Linear Regression, Train Set')
Accuracy: 0.2361111111111111
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Mean squared error: 1427.1834795321638
Correlation coefficient: 0.7559823206669919
Coefficient of determination (R-squared score, R2 score): 0.5203541026650583
Evaluate (reduced)¶

The model is based on classifying data into class_num (e.g., class_num = 160) intervals (histogram, by np.digitize). Here, evaluate the results with a smaller number of classes (e.g., reduced_class_num = 16); that is, for the example of class_num = 160 and reduced_class_num = 16, each coarse class covers class_num / reduced_class_num = 10 fine classes: classes 0, 1, ..., 9 become one class, i.e., 0; ...; classes 150, 151, ..., 159 become one class, i.e., 15.

In [ ]:
print("---- ---- ----")
# Define the boundaries for digitization
reduced_class_num = 16
intervals = np.arange(0, class_num + 1, class_num / reduced_class_num)
print(intervals)

y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
print(y_pred)
print(y_pred_test_digital)
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
---- ---- ----
[  0.  30.  60.  90. 120. 150. 180. 210. 240. 270. 300. 330. 360. 390.
 420. 450. 480.]
[122 168 182 158 129 293 132 244 223 291 122 161 110 108 132  84  78 138
 211 145 131 199 116 129 130 184 163 137 165 153 141 126 188 113 148  75
 327 166 131 117 108 134 120 124 132 141 184 211 196 127 130 138  97 144
 134 129 126 103 183 145 123 135 145 106 162 145 119 171 138 122 184 170]
[ 4  5  6  5  4  9  4  8  7  9  4  5  3  3  4  2  2  4  7  4  4  6  3  4
  4  6  5  4  5  5  4  4  6  3  4  2 10  5  4  3  3  4  4  4  4  4  6  7
  6  4  4  4  3  4  4  4  4  3  6  4  4  4  4  3  5  4  3  5  4  4  6  5]
Mean squared error: 2.7222222222222223
Correlation coefficient: 0.6060794440849583
Coefficient of determination (R-squared score, R2 score): 0.3095552619991193
Mean squared error: 1.7105263157894737
Correlation coefficient: 0.7442081479296433
Coefficient of determination (R-squared score, R2 score): 0.49425421012452153

Fixing the reduced class number, I enumerate the original class number to see which original class number performs better.

In [ ]:
# independent data (predictors)
x = group_satcked_green
# dependent data (labels/targets)
y = np.squeeze(stacked_red)

# Split the data into train and test sets (fixed seed -> reproducible split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

# Global range of y, widened by machine epsilon so the extremes fall strictly
# inside the outermost digitization bins.
# NOTE(review): np.finfo(float).eps is relative to 1.0, so this widening is
# effectively a no-op unless |y| is near 1 — confirm if exact boundary
# membership matters.
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Sweep the fine class count (multiples of reduced_class_num) and score each
# model after folding its labels back down to reduced_class_num coarse bins.
reduced_class_num = 16
class_num_array = np.arange(reduced_class_num, reduced_class_num * 100 + 1, reduced_class_num)
mse_test_list = []
correlation_test_list = []
r_squared_test_list = []
mse_train_list = []
correlation_train_list = []
r_squared_train_list = []

for class_num in class_num_array:
    print('---- ---- ----')
    print(f'class_num = {class_num}')

    # Re-split every iteration: y_train/y_test are overwritten by np.digitize
    # below, so fresh continuous labels are needed each pass. The fixed
    # random_state keeps the split identical across iterations.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

    # Generate class_num+1 evenly spaced interval edges over the full y range.
    intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
    # Digitize to interval indices 0 .. class_num-1.
    y_train = np.digitize(y_train, intervals) - 1
    y_test = np.digitize(y_test, intervals) - 1

    # to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
    unique_elements = np.unique(y_train)
    print("Number of unique elements:", len(unique_elements))

    model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='multinomial')
    fit_result = model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_ = model.predict(x_train)

    # Boundaries for folding class_num fine classes into reduced_class_num
    # coarse bins. (Fix: the bin width previously hard-coded 16 instead of
    # using the reduced_class_num constant defined above.)
    intervals = np.arange(0, class_num+1, class_num/reduced_class_num)
    print(intervals)

    y_train_digital = np.digitize(y_train, intervals) - 1
    y_test_digital = np.digitize(y_test, intervals) - 1
    y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
    y_pred_test_digital = np.digitize(y_pred, intervals) - 1

    print("test eval:")
    mse = mean_squared_error(y_test_digital, y_pred_test_digital)
    print("Mean squared error:", mse)
    correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
    print("Correlation coefficient:", correlation)
    r_squared = r2_score(y_test_digital, y_pred_test_digital)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)
    mse_test_list.append(mse)
    correlation_test_list.append(correlation)
    r_squared_test_list.append(r_squared)

    print("train eval:")
    mse = mean_squared_error(y_train_digital, y_pred_train_digital)
    print("Mean squared error:", mse)
    correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
    print("Correlation coefficient:", correlation)
    r_squared = r2_score(y_train_digital, y_pred_train_digital)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)
    mse_train_list.append(mse)
    correlation_train_list.append(correlation)
    r_squared_train_list.append(r_squared)

    plot_comparison(y_test, y_pred, f'Logistic Linear Regression Reduced Evaluation {class_num} to {reduced_class_num}, Test Set')
    plot_comparison(y_train, y_pred_, f'Logistic Linear Regression Reduced Evaluation {class_num} to {reduced_class_num}, Train Set')
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)
---- ---- ----
class_num = 16
Number of unique elements: 16
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.]
test eval:
Mean squared error: 3.5972222222222223
Correlation coefficient: 0.47198973344177797
Coefficient of determination (R-squared score, R2 score): 0.08762659621312197
train eval:
Mean squared error: 2.2989766081871346
Correlation coefficient: 0.5910190720855697
Coefficient of determination (R-squared score, R2 score): 0.32026901318017964
---- ---- ----
class_num = 32
Number of unique elements: 29
[ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20. 22. 24. 26. 28. 30. 32.]
test eval:
Mean squared error: 3.236111111111111
Correlation coefficient: 0.5357558077876284
Coefficient of determination (R-squared score, R2 score): 0.17921620431527963
train eval:
Mean squared error: 2.2346491228070176
Correlation coefficient: 0.6136660388859967
Coefficient of determination (R-squared score, R2 score): 0.33928851297036855
---- ---- ----
class_num = 48
Number of unique elements: 40
[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27. 30. 33. 36. 39. 42. 45. 48.]
test eval:
Mean squared error: 3.5555555555555554
Correlation coefficient: 0.4803971320092436
Coefficient of determination (R-squared score, R2 score): 0.09819462791721711
train eval:
Mean squared error: 2.245614035087719
Correlation coefficient: 0.6144510013822345
Coefficient of determination (R-squared score, R2 score): 0.3360465527788591
---- ---- ----
class_num = 64
Number of unique elements: 54
[ 0.  4.  8. 12. 16. 20. 24. 28. 32. 36. 40. 44. 48. 52. 56. 60. 64.]
test eval:
Mean squared error: 3.4305555555555554
Correlation coefficient: 0.503739746255114
Coefficient of determination (R-squared score, R2 score): 0.12989872302950245
train eval:
Mean squared error: 2.111111111111111
Correlation coefficient: 0.6485234829384348
Coefficient of determination (R-squared score, R2 score): 0.3758145977947087
---- ---- ----
class_num = 80
Number of unique elements: 64
[ 0.  5. 10. 15. 20. 25. 30. 35. 40. 45. 50. 55. 60. 65. 70. 75. 80.]
test eval:
Mean squared error: 4.180555555555555
Correlation coefficient: 0.4427520028671864
Coefficient of determination (R-squared score, R2 score): -0.060325847644209674
train eval:
Mean squared error: 2.215643274853801
Correlation coefficient: 0.6381027090493488
Coefficient of determination (R-squared score, R2 score): 0.3449079106356516
---- ---- ----
class_num = 96
Number of unique elements: 71
[ 0.  6. 12. 18. 24. 30. 36. 42. 48. 54. 60. 66. 72. 78. 84. 90. 96.]
test eval:
Mean squared error: 3.3472222222222223
Correlation coefficient: 0.504180754959501
Coefficient of determination (R-squared score, R2 score): 0.15103478643769264
train eval:
Mean squared error: 2.1703216374269005
Correlation coefficient: 0.6520581653648752
Coefficient of determination (R-squared score, R2 score): 0.35830801276055746
---- ---- ----
class_num = 112
Number of unique elements: 84
[  0.   7.  14.  21.  28.  35.  42.  49.  56.  63.  70.  77.  84.  91.
  98. 105. 112.]
test eval:
Mean squared error: 3.4722222222222223
Correlation coefficient: 0.5037710872095293
Coefficient of determination (R-squared score, R2 score): 0.1193306913254073
train eval:
Mean squared error: 2.1907894736842106
Correlation coefficient: 0.6513815613624006
Coefficient of determination (R-squared score, R2 score): 0.3522563537364065
---- ---- ----
class_num = 128
Number of unique elements: 94
[  0.   8.  16.  24.  32.  40.  48.  56.  64.  72.  80.  88.  96. 104.
 112. 120. 128.]
test eval:
Mean squared error: 3.375
Correlation coefficient: 0.5033317430794849
Coefficient of determination (R-squared score, R2 score): 0.14398943196829594
train eval:
Mean squared error: 1.9546783625730995
Correlation coefficient: 0.6898003069787897
Coefficient of determination (R-squared score, R2 score): 0.42206656319357727
---- ---- ----
class_num = 144
Number of unique elements: 103
[  0.   9.  18.  27.  36.  45.  54.  63.  72.  81.  90.  99. 108. 117.
 126. 135. 144.]
test eval:
Mean squared error: 3.736111111111111
Correlation coefficient: 0.4735339459159608
Coefficient of determination (R-squared score, R2 score): 0.052399823866138284
train eval:
Mean squared error: 2.0826023391812867
Correlation coefficient: 0.6695835615867002
Coefficient of determination (R-squared score, R2 score): 0.3842436942926333
---- ---- ----
class_num = 160
Number of unique elements: 112
[  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100. 110. 120. 130.
 140. 150. 160.]
test eval:
Mean squared error: 3.2916666666666665
Correlation coefficient: 0.5242259606046868
Coefficient of determination (R-squared score, R2 score): 0.16512549537648613
train eval:
Mean squared error: 2.1271929824561404
Correlation coefficient: 0.669450391726396
Coefficient of determination (R-squared score, R2 score): 0.3710597228471615
---- ---- ----
class_num = 176
Number of unique elements: 119
[  0.  11.  22.  33.  44.  55.  66.  77.  88.  99. 110. 121. 132. 143.
 154. 165. 176.]
test eval:
Mean squared error: 3.0555555555555554
Correlation coefficient: 0.5434058877638174
Coefficient of determination (R-squared score, R2 score): 0.22501100836635846
train eval:
Mean squared error: 1.8801169590643274
Correlation coefficient: 0.7027983256653713
Coefficient of determination (R-squared score, R2 score): 0.4441118924958417
---- ---- ----
class_num = 192
Number of unique elements: 129
[  0.  12.  24.  36.  48.  60.  72.  84.  96. 108. 120. 132. 144. 156.
 168. 180. 192.]
test eval:
Mean squared error: 2.9722222222222223
Correlation coefficient: 0.5613286852055313
Coefficient of determination (R-squared score, R2 score): 0.24614707177454864
train eval:
Mean squared error: 2.068713450292398
Correlation coefficient: 0.6795194596695697
Coefficient of determination (R-squared score, R2 score): 0.3883501772018787
---- ---- ----
class_num = 208
Number of unique elements: 135
[  0.  13.  26.  39.  52.  65.  78.  91. 104. 117. 130. 143. 156. 169.
 182. 195. 208.]
test eval:
Mean squared error: 3.263888888888889
Correlation coefficient: 0.5167605713638903
Coefficient of determination (R-squared score, R2 score): 0.17217084984588282
train eval:
Mean squared error: 1.9714912280701755
Correlation coefficient: 0.6990042154331321
Coefficient of determination (R-squared score, R2 score): 0.417095557566596
---- ---- ----
class_num = 224
Number of unique elements: 144
[  0.  14.  28.  42.  56.  70.  84.  98. 112. 126. 140. 154. 168. 182.
 196. 210. 224.]
test eval:
Mean squared error: 3.2083333333333335
Correlation coefficient: 0.5437377502756681
Coefficient of determination (R-squared score, R2 score): 0.18626155878467632
train eval:
Mean squared error: 2.017543859649123
Correlation coefficient: 0.6953029871474389
Coefficient of determination (R-squared score, R2 score): 0.40347932476225623
---- ---- ----
class_num = 240
Number of unique elements: 151
[  0.  15.  30.  45.  60.  75.  90. 105. 120. 135. 150. 165. 180. 195.
 210. 225. 240.]
test eval:
Mean squared error: 3.2916666666666665
Correlation coefficient: 0.5194551289734144
Coefficient of determination (R-squared score, R2 score): 0.16512549537648613
train eval:
Mean squared error: 2.067982456140351
Correlation coefficient: 0.6873768774083306
Coefficient of determination (R-squared score, R2 score): 0.38856630788131263
---- ---- ----
class_num = 256
Number of unique elements: 162
[  0.  16.  32.  48.  64.  80.  96. 112. 128. 144. 160. 176. 192. 208.
 224. 240. 256.]
test eval:
Mean squared error: 3.361111111111111
Correlation coefficient: 0.5009248047595136
Coefficient of determination (R-squared score, R2 score): 0.1475121092029943
train eval:
Mean squared error: 1.8940058479532165
Correlation coefficient: 0.7093807333281182
Coefficient of determination (R-squared score, R2 score): 0.4400054095865963
---- ---- ----
class_num = 272
Number of unique elements: 168
[  0.  17.  34.  51.  68.  85. 102. 119. 136. 153. 170. 187. 204. 221.
 238. 255. 272.]
test eval:
Mean squared error: 3.5277777777777777
Correlation coefficient: 0.48355826570450633
Coefficient of determination (R-squared score, R2 score): 0.1052399823866138
train eval:
Mean squared error: 1.8157894736842106
Correlation coefficient: 0.728019282425673
Coefficient of determination (R-squared score, R2 score): 0.4631313922860306
---- ---- ----
class_num = 288
Number of unique elements: 174
[  0.  18.  36.  54.  72.  90. 108. 126. 144. 162. 180. 198. 216. 234.
 252. 270. 288.]
test eval:
Mean squared error: 3.5277777777777777
Correlation coefficient: 0.4899766682551543
Coefficient of determination (R-squared score, R2 score): 0.1052399823866138
train eval:
Mean squared error: 2.021198830409357
Correlation coefficient: 0.6856906934944299
Coefficient of determination (R-squared score, R2 score): 0.40239867136508645
---- ---- ----
class_num = 304
Number of unique elements: 184
[  0.  19.  38.  57.  76.  95. 114. 133. 152. 171. 190. 209. 228. 247.
 266. 285. 304.]
test eval:
Mean squared error: 3.0694444444444446
Correlation coefficient: 0.545368270536066
Coefficient of determination (R-squared score, R2 score): 0.2214883311316601
train eval:
Mean squared error: 1.9239766081871346
Correlation coefficient: 0.7049550788995437
Coefficient of determination (R-squared score, R2 score): 0.43114405172980375
---- ---- ----
class_num = 320
Number of unique elements: 191
[  0.  20.  40.  60.  80. 100. 120. 140. 160. 180. 200. 220. 240. 260.
 280. 300. 320.]
test eval:
Mean squared error: 3.4166666666666665
Correlation coefficient: 0.49449225822246445
Coefficient of determination (R-squared score, R2 score): 0.1334214002642008
train eval:
Mean squared error: 2.08187134502924
Correlation coefficient: 0.6930186726826559
Coefficient of determination (R-squared score, R2 score): 0.3844598249720673
---- ---- ----
class_num = 336
Number of unique elements: 197
[  0.  21.  42.  63.  84. 105. 126. 147. 168. 189. 210. 231. 252. 273.
 294. 315. 336.]
test eval:
Mean squared error: 3.8333333333333335
Correlation coefficient: 0.44750228465446723
Coefficient of determination (R-squared score, R2 score): 0.02774108322324964
train eval:
Mean squared error: 1.7997076023391814
Correlation coefficient: 0.7278634370384258
Coefficient of determination (R-squared score, R2 score): 0.46788626723357785
---- ---- ----
class_num = 352
Number of unique elements: 205
[  0.  22.  44.  66.  88. 110. 132. 154. 176. 198. 220. 242. 264. 286.
 308. 330. 352.]
test eval:
Mean squared error: 3.5694444444444446
Correlation coefficient: 0.48227115458952186
Coefficient of determination (R-squared score, R2 score): 0.09467195068251877
train eval:
Mean squared error: 1.7675438596491229
Correlation coefficient: 0.7300415728003163
Coefficient of determination (R-squared score, R2 score): 0.4773960171286723
---- ---- ----
class_num = 368
Number of unique elements: 212
[  0.  23.  46.  69.  92. 115. 138. 161. 184. 207. 230. 253. 276. 299.
 322. 345. 368.]
test eval:
Mean squared error: 3.125
Correlation coefficient: 0.5332746426444116
Coefficient of determination (R-squared score, R2 score): 0.20739762219286662
train eval:
Mean squared error: 1.9649122807017543
Correlation coefficient: 0.6962281852214511
Coefficient of determination (R-squared score, R2 score): 0.41904073368150174
---- ---- ----
class_num = 384
Number of unique elements: 219
[  0.  24.  48.  72.  96. 120. 144. 168. 192. 216. 240. 264. 288. 312.
 336. 360. 384.]
test eval:
Mean squared error: 3.4722222222222223
Correlation coefficient: 0.5059495216654009
Coefficient of determination (R-squared score, R2 score): 0.1193306913254073
train eval:
Mean squared error: 1.641812865497076
Correlation coefficient: 0.7550314567695265
Coefficient of determination (R-squared score, R2 score): 0.5145704939913143
---- ---- ----
class_num = 400
Number of unique elements: 224
[  0.  25.  50.  75. 100. 125. 150. 175. 200. 225. 250. 275. 300. 325.
 350. 375. 400.]
test eval:
Mean squared error: 3.638888888888889
Correlation coefficient: 0.48420812404275726
Coefficient of determination (R-squared score, R2 score): 0.07705856450902682
train eval:
Mean squared error: 1.601608187134503
Correlation coefficient: 0.757055265058668
Coefficient of determination (R-squared score, R2 score): 0.5264576813601825
---- ---- ----
class_num = 416
Number of unique elements: 226
[  0.  26.  52.  78. 104. 130. 156. 182. 208. 234. 260. 286. 312. 338.
 364. 390. 416.]
test eval:
Mean squared error: 3.388888888888889
Correlation coefficient: 0.4852968909152171
Coefficient of determination (R-squared score, R2 score): 0.1404667547335975
train eval:
Mean squared error: 1.706140350877193
Correlation coefficient: 0.7477674868813875
Coefficient of determination (R-squared score, R2 score): 0.49555099420112536
---- ---- ----
class_num = 432
Number of unique elements: 234
[  0.  27.  54.  81. 108. 135. 162. 189. 216. 243. 270. 297. 324. 351.
 378. 405. 432.]
test eval:
Mean squared error: 3.6527777777777777
Correlation coefficient: 0.47036606231027916
Coefficient of determination (R-squared score, R2 score): 0.07353588727432847
train eval:
Mean squared error: 1.861842105263158
Correlation coefficient: 0.7191920076627579
Coefficient of determination (R-squared score, R2 score): 0.44951515948169074
---- ---- ----
class_num = 448
Number of unique elements: 243
[  0.  28.  56.  84. 112. 140. 168. 196. 224. 252. 280. 308. 336. 364.
 392. 420. 448.]
test eval:
Mean squared error: 3.0833333333333335
Correlation coefficient: 0.5250607650409068
Coefficient of determination (R-squared score, R2 score): 0.21796565389696165
train eval:
Mean squared error: 1.8011695906432748
Correlation coefficient: 0.7347557280791996
Coefficient of determination (R-squared score, R2 score): 0.46745400587470987
---- ---- ----
class_num = 464
Number of unique elements: 243
[  0.  29.  58.  87. 116. 145. 174. 203. 232. 261. 290. 319. 348. 377.
 406. 435. 464.]
test eval:
Mean squared error: 3.013888888888889
Correlation coefficient: 0.5562252199875786
Coefficient of determination (R-squared score, R2 score): 0.2355790400704535
train eval:
Mean squared error: 1.712719298245614
Correlation coefficient: 0.7451863485220609
Coefficient of determination (R-squared score, R2 score): 0.4936058180862197
---- ---- ----
class_num = 480
Number of unique elements: 253
[  0.  30.  60.  90. 120. 150. 180. 210. 240. 270. 300. 330. 360. 390.
 420. 450. 480.]
test eval:
Mean squared error: 2.7222222222222223
Correlation coefficient: 0.6060794440849583
Coefficient of determination (R-squared score, R2 score): 0.3095552619991193
train eval:
Mean squared error: 1.7105263157894737
Correlation coefficient: 0.7442081479296433
Coefficient of determination (R-squared score, R2 score): 0.49425421012452153
---- ---- ----
class_num = 496
Number of unique elements: 257
[  0.  31.  62.  93. 124. 155. 186. 217. 248. 279. 310. 341. 372. 403.
 434. 465. 496.]
test eval:
Mean squared error: 3.5
Correlation coefficient: 0.4832186801513632
Coefficient of determination (R-squared score, R2 score): 0.11228533685601061
train eval:
Mean squared error: 1.713450292397661
Correlation coefficient: 0.7396308046833006
Coefficient of determination (R-squared score, R2 score): 0.4933896874067857
---- ---- ----
class_num = 512
Number of unique elements: 265
[  0.  32.  64.  96. 128. 160. 192. 224. 256. 288. 320. 352. 384. 416.
 448. 480. 512.]
test eval:
Mean squared error: 3.4166666666666665
Correlation coefficient: 0.4900594619824944
Coefficient of determination (R-squared score, R2 score): 0.1334214002642008
train eval:
Mean squared error: 1.631578947368421
Correlation coefficient: 0.755311210710647
Coefficient of determination (R-squared score, R2 score): 0.5175963235033898
---- ---- ----
class_num = 528
Number of unique elements: 270
[  0.  33.  66.  99. 132. 165. 198. 231. 264. 297. 330. 363. 396. 429.
 462. 495. 528.]
test eval:
Mean squared error: 4.027777777777778
Correlation coefficient: 0.43736758254217567
Coefficient of determination (R-squared score, R2 score): -0.021576398062527424
train eval:
Mean squared error: 1.7470760233918128
Correlation coefficient: 0.7376111428489114
Coefficient of determination (R-squared score, R2 score): 0.48344767615282336
---- ---- ----
class_num = 544
Number of unique elements: 277
[  0.  34.  68. 102. 136. 170. 204. 238. 272. 306. 340. 374. 408. 442.
 476. 510. 544.]
test eval:
Mean squared error: 4.0
Correlation coefficient: 0.4100064508731889
Coefficient of determination (R-squared score, R2 score): -0.014531043593130732
train eval:
Mean squared error: 1.5314327485380117
Correlation coefficient: 0.7705055211445285
Coefficient of determination (R-squared score, R2 score): 0.5472062265858431
---- ---- ----
class_num = 560
Number of unique elements: 277
[  0.  35.  70. 105. 140. 175. 210. 245. 280. 315. 350. 385. 420. 455.
 490. 525. 560.]
test eval:
Mean squared error: 3.9166666666666665
Correlation coefficient: 0.47230871755082143
Coefficient of determination (R-squared score, R2 score): 0.006605019815059454
train eval:
Mean squared error: 1.6074561403508771
Correlation coefficient: 0.756754569333987
Coefficient of determination (R-squared score, R2 score): 0.5247286359247106
---- ---- ----
class_num = 576
Number of unique elements: 286
[  0.  36.  72. 108. 144. 180. 216. 252. 288. 324. 360. 396. 432. 468.
 504. 540. 576.]
test eval:
Mean squared error: 3.9722222222222223
Correlation coefficient: 0.42378310656099694
Coefficient of determination (R-squared score, R2 score): -0.007485689123734041
train eval:
Mean squared error: 1.6564327485380117
Correlation coefficient: 0.7506155099667133
Coefficient of determination (R-squared score, R2 score): 0.510247880402635
---- ---- ----
class_num = 592
Number of unique elements: 292
[  0.  37.  74. 111. 148. 185. 222. 259. 296. 333. 370. 407. 444. 481.
 518. 555. 592.]
test eval:
Mean squared error: 3.388888888888889
Correlation coefficient: 0.49399417848482213
Coefficient of determination (R-squared score, R2 score): 0.1404667547335975
train eval:
Mean squared error: 1.6542397660818713
Correlation coefficient: 0.7478476246087622
Coefficient of determination (R-squared score, R2 score): 0.5108962724409369
---- ---- ----
class_num = 608
Number of unique elements: 300
[  0.  38.  76. 114. 152. 190. 228. 266. 304. 342. 380. 418. 456. 494.
 532. 570. 608.]
test eval:
Mean squared error: 2.861111111111111
Correlation coefficient: 0.5654923211396736
Coefficient of determination (R-squared score, R2 score): 0.27432848965213563
train eval:
Mean squared error: 1.5
Correlation coefficient: 0.7737523401399934
Coefficient of determination (R-squared score, R2 score): 0.5564998458015036
---- ---- ----
class_num = 624
Number of unique elements: 304
[  0.  39.  78. 117. 156. 195. 234. 273. 312. 351. 390. 429. 468. 507.
 546. 585. 624.]
test eval:
Mean squared error: 3.6666666666666665
Correlation coefficient: 0.41626635730762723
Coefficient of determination (R-squared score, R2 score): 0.07001321003963012
train eval:
Mean squared error: 1.5402046783625731
Correlation coefficient: 0.7637629255483993
Coefficient of determination (R-squared score, R2 score): 0.5446126584326354
---- ---- ----
class_num = 640
Number of unique elements: 307
[  0.  40.  80. 120. 160. 200. 240. 280. 320. 360. 400. 440. 480. 520.
 560. 600. 640.]
test eval:
Mean squared error: 3.4444444444444446
Correlation coefficient: 0.4798743430106603
Coefficient of determination (R-squared score, R2 score): 0.1263760457948041
train eval:
Mean squared error: 1.6059941520467835
Correlation coefficient: 0.7600503856137294
Coefficient of determination (R-squared score, R2 score): 0.5251608972835786
---- ---- ----
class_num = 656
Number of unique elements: 309
[  0.  41.  82. 123. 164. 205. 246. 287. 328. 369. 410. 451. 492. 533.
 574. 615. 656.]
test eval:
Mean squared error: 3.8055555555555554
Correlation coefficient: 0.37497588643165786
Coefficient of determination (R-squared score, R2 score): 0.034786437692646444
train eval:
Mean squared error: 1.5467836257309941
Correlation coefficient: 0.7652131149397225
Coefficient of determination (R-squared score, R2 score): 0.5426674823177298
---- ---- ----
class_num = 672
Number of unique elements: 315
[  0.  42.  84. 126. 168. 210. 252. 294. 336. 378. 420. 462. 504. 546.
 588. 630. 672.]
test eval:
Mean squared error: 3.1666666666666665
Correlation coefficient: 0.5150190255186106
Coefficient of determination (R-squared score, R2 score): 0.19682959048877147
train eval:
Mean squared error: 1.560672514619883
Correlation coefficient: 0.7619757686949364
Coefficient of determination (R-squared score, R2 score): 0.5385609994084845
---- ---- ----
class_num = 688
Number of unique elements: 321
[  0.  43.  86. 129. 172. 215. 258. 301. 344. 387. 430. 473. 516. 559.
 602. 645. 688.]
test eval:
Mean squared error: 4.013888888888889
Correlation coefficient: 0.4005119454382824
Coefficient of determination (R-squared score, R2 score): -0.01805372082782908
train eval:
Mean squared error: 1.5482456140350878
Correlation coefficient: 0.7727192950685039
Coefficient of determination (R-squared score, R2 score): 0.5422352209588619
---- ---- ----
class_num = 704
Number of unique elements: 332
[  0.  44.  88. 132. 176. 220. 264. 308. 352. 396. 440. 484. 528. 572.
 616. 660. 704.]
test eval:
Mean squared error: 3.75
Correlation coefficient: 0.4418255075484136
Coefficient of determination (R-squared score, R2 score): 0.04887714663143994
train eval:
Mean squared error: 1.439327485380117
Correlation coefficient: 0.7839238700032121
Coefficient of determination (R-squared score, R2 score): 0.5744386921945226
---- ---- ----
class_num = 720
Number of unique elements: 327
[  0.  45.  90. 135. 180. 225. 270. 315. 360. 405. 450. 495. 540. 585.
 630. 675. 720.]
test eval:
Mean squared error: 3.486111111111111
Correlation coefficient: 0.4555483337428381
Coefficient of determination (R-squared score, R2 score): 0.11580801409070896
train eval:
Mean squared error: 1.5182748538011697
Correlation coefficient: 0.7715449215307933
Coefficient of determination (R-squared score, R2 score): 0.5510965788156543
---- ---- ----
class_num = 736
Number of unique elements: 337
[  0.  46.  92. 138. 184. 230. 276. 322. 368. 414. 460. 506. 552. 598.
 644. 690. 736.]
test eval:
Mean squared error: 4.208333333333333
Correlation coefficient: 0.41224302218437786
Coefficient of determination (R-squared score, R2 score): -0.06737120211360637
train eval:
Mean squared error: 1.662280701754386
Correlation coefficient: 0.7509674113487845
Coefficient of determination (R-squared score, R2 score): 0.5085188349671632
---- ---- ----
class_num = 752
Number of unique elements: 342
[  0.  47.  94. 141. 188. 235. 282. 329. 376. 423. 470. 517. 564. 611.
 658. 705. 752.]
test eval:
Mean squared error: 3.861111111111111
Correlation coefficient: 0.4226049614507438
Coefficient of determination (R-squared score, R2 score): 0.02069572875385295
train eval:
Mean squared error: 1.519736842105263
Correlation coefficient: 0.7694432377079866
Coefficient of determination (R-squared score, R2 score): 0.5506643174567865
---- ---- ----
class_num = 768
Number of unique elements: 355
[  0.  48.  96. 144. 192. 240. 288. 336. 384. 432. 480. 528. 576. 624.
 672. 720. 768.]
test eval:
Mean squared error: 3.5555555555555554
Correlation coefficient: 0.4702633221611583
Coefficient of determination (R-squared score, R2 score): 0.09819462791721711
train eval:
Mean squared error: 1.4407894736842106
Correlation coefficient: 0.78251053221998
Coefficient of determination (R-squared score, R2 score): 0.5740064308356547
---- ---- ----
class_num = 784
Number of unique elements: 351
[  0.  49.  98. 147. 196. 245. 294. 343. 392. 441. 490. 539. 588. 637.
 686. 735. 784.]
test eval:
Mean squared error: 3.2916666666666665
Correlation coefficient: 0.4910675336428644
Coefficient of determination (R-squared score, R2 score): 0.16512549537648613
train eval:
Mean squared error: 1.4663742690058479
Correlation coefficient: 0.7824074519633014
Coefficient of determination (R-squared score, R2 score): 0.5664418570554659
---- ---- ----
class_num = 800
Number of unique elements: 355
[  0.  50. 100. 150. 200. 250. 300. 350. 400. 450. 500. 550. 600. 650.
 700. 750. 800.]
test eval:
Mean squared error: 3.8333333333333335
Correlation coefficient: 0.4353159979326409
Coefficient of determination (R-squared score, R2 score): 0.02774108322324964
train eval:
Mean squared error: 1.5314327485380117
Correlation coefficient: 0.7676016320869605
Coefficient of determination (R-squared score, R2 score): 0.5472062265858431
---- ---- ----
class_num = 816
Number of unique elements: 365
[  0.  51. 102. 153. 204. 255. 306. 357. 408. 459. 510. 561. 612. 663.
 714. 765. 816.]
test eval:
Mean squared error: 3.25
Correlation coefficient: 0.5351553225029251
Coefficient of determination (R-squared score, R2 score): 0.17569352708058128
train eval:
Mean squared error: 1.4305555555555556
Correlation coefficient: 0.7833717170500211
Coefficient of determination (R-squared score, R2 score): 0.5770322603477303
---- ---- ----
class_num = 832
Number of unique elements: 369
[  0.  52. 104. 156. 208. 260. 312. 364. 416. 468. 520. 572. 624. 676.
 728. 780. 832.]
test eval:
Mean squared error: 3.1944444444444446
Correlation coefficient: 0.5262219237468545
Coefficient of determination (R-squared score, R2 score): 0.18978423601937477
train eval:
Mean squared error: 1.418859649122807
Correlation coefficient: 0.7891367199216424
Coefficient of determination (R-squared score, R2 score): 0.5804903512186737
---- ---- ----
class_num = 848
Number of unique elements: 374
[  0.  53. 106. 159. 212. 265. 318. 371. 424. 477. 530. 583. 636. 689.
 742. 795. 848.]
test eval:
Mean squared error: 3.2916666666666665
Correlation coefficient: 0.48057246999800624
Coefficient of determination (R-squared score, R2 score): 0.16512549537648613
train eval:
Mean squared error: 1.4144736842105263
Correlation coefficient: 0.787556821947226
Coefficient of determination (R-squared score, R2 score): 0.5817871352952775
---- ---- ----
class_num = 864
Number of unique elements: 376
[  0.  54. 108. 162. 216. 270. 324. 378. 432. 486. 540. 594. 648. 702.
 756. 810. 864.]
test eval:
Mean squared error: 3.9027777777777777
Correlation coefficient: 0.3868950782276027
Coefficient of determination (R-squared score, R2 score): 0.0101276970497578
train eval:
Mean squared error: 1.388157894736842
Correlation coefficient: 0.7900191528231353
Coefficient of determination (R-squared score, R2 score): 0.5895678397549002
---- ---- ----
class_num = 880
Number of unique elements: 375
[  0.  55. 110. 165. 220. 275. 330. 385. 440. 495. 550. 605. 660. 715.
 770. 825. 880.]
test eval:
Mean squared error: 3.4444444444444446
Correlation coefficient: 0.49664932833991754
Coefficient of determination (R-squared score, R2 score): 0.1263760457948041
train eval:
Mean squared error: 1.4619883040935673
Correlation coefficient: 0.7809029480746438
Coefficient of determination (R-squared score, R2 score): 0.5677386411320697
---- ---- ----
class_num = 896
Number of unique elements: 388
[  0.  56. 112. 168. 224. 280. 336. 392. 448. 504. 560. 616. 672. 728.
 784. 840. 896.]
test eval:
Mean squared error: 3.3333333333333335
Correlation coefficient: 0.5034573464200038
Coefficient of determination (R-squared score, R2 score): 0.15455746367239098
train eval:
Mean squared error: 1.41593567251462
Correlation coefficient: 0.7847663093290186
Coefficient of determination (R-squared score, R2 score): 0.5813548739364096
---- ---- ----
class_num = 912
Number of unique elements: 393
[  0.  57. 114. 171. 228. 285. 342. 399. 456. 513. 570. 627. 684. 741.
 798. 855. 912.]
test eval:
Mean squared error: 4.166666666666667
Correlation coefficient: 0.4071858212864697
Coefficient of determination (R-squared score, R2 score): -0.05680317040951133
train eval:
Mean squared error: 1.361842105263158
Correlation coefficient: 0.7938235266125524
Coefficient of determination (R-squared score, R2 score): 0.597348544214523
---- ---- ----
class_num = 928
Number of unique elements: 393
[  0.  58. 116. 174. 232. 290. 348. 406. 464. 522. 580. 638. 696. 754.
 812. 870. 928.]
test eval:
Mean squared error: 3.0277777777777777
Correlation coefficient: 0.5306120337926347
Coefficient of determination (R-squared score, R2 score): 0.23205636283575515
train eval:
Mean squared error: 1.3640350877192982
Correlation coefficient: 0.7928116156624162
Coefficient of determination (R-squared score, R2 score): 0.5967001521762211
---- ---- ----
class_num = 944
Number of unique elements: 398
[  0.  59. 118. 177. 236. 295. 354. 413. 472. 531. 590. 649. 708. 767.
 826. 885. 944.]
test eval:
Mean squared error: 3.2777777777777777
Correlation coefficient: 0.5038584858703183
Coefficient of determination (R-squared score, R2 score): 0.16864817261118448
train eval:
Mean squared error: 1.3801169590643274
Correlation coefficient: 0.7907118781047191
Coefficient of determination (R-squared score, R2 score): 0.5919452772286737
---- ---- ----
class_num = 960
Number of unique elements: 398
[  0.  60. 120. 180. 240. 300. 360. 420. 480. 540. 600. 660. 720. 780.
 840. 900. 960.]
test eval:
Mean squared error: 3.4305555555555554
Correlation coefficient: 0.4767013049209689
Coefficient of determination (R-squared score, R2 score): 0.12989872302950245
train eval:
Mean squared error: 1.3494152046783625
Correlation coefficient: 0.7964253901758411
Coefficient of determination (R-squared score, R2 score): 0.6010227657649003
---- ---- ----
class_num = 976
Number of unique elements: 408
[  0.  61. 122. 183. 244. 305. 366. 427. 488. 549. 610. 671. 732. 793.
 854. 915. 976.]
test eval:
Mean squared error: 3.986111111111111
Correlation coefficient: 0.46251262770571977
Coefficient of determination (R-squared score, R2 score): -0.011008366358432387
train eval:
Mean squared error: 1.404970760233918
Correlation coefficient: 0.7901530589059842
Coefficient of determination (R-squared score, R2 score): 0.5845968341279191
---- ---- ----
class_num = 992
Number of unique elements: 406
[  0.  62. 124. 186. 248. 310. 372. 434. 496. 558. 620. 682. 744. 806.
 868. 930. 992.]
test eval:
Mean squared error: 4.25
Correlation coefficient: 0.4169227166945107
Coefficient of determination (R-squared score, R2 score): -0.0779392338177014
train eval:
Mean squared error: 1.391812865497076
Correlation coefficient: 0.7894680575074436
Coefficient of determination (R-squared score, R2 score): 0.5884871863577303
---- ---- ----
class_num = 1008
Number of unique elements: 414
[   0.   63.  126.  189.  252.  315.  378.  441.  504.  567.  630.  693.
  756.  819.  882.  945. 1008.]
test eval:
Mean squared error: 4.055555555555555
Correlation coefficient: 0.4171084530480764
Coefficient of determination (R-squared score, R2 score): -0.028621752531924338
train eval:
Mean squared error: 1.3874269005847952
Correlation coefficient: 0.7918170168929646
Coefficient of determination (R-squared score, R2 score): 0.5897839704343342
---- ---- ----
class_num = 1024
Number of unique elements: 421
[   0.   64.  128.  192.  256.  320.  384.  448.  512.  576.  640.  704.
  768.  832.  896.  960. 1024.]
test eval:
Mean squared error: 4.097222222222222
Correlation coefficient: 0.369574849704913
Coefficient of determination (R-squared score, R2 score): -0.039189784236019376
train eval:
Mean squared error: 1.331140350877193
Correlation coefficient: 0.7977501577967256
Coefficient of determination (R-squared score, R2 score): 0.6064260327507495
---- ---- ----
class_num = 1040
Number of unique elements: 429
[   0.   65.  130.  195.  260.  325.  390.  455.  520.  585.  650.  715.
  780.  845.  910.  975. 1040.]
test eval:
Mean squared error: 4.222222222222222
Correlation coefficient: 0.38461444972865816
Coefficient of determination (R-squared score, R2 score): -0.07089387934830471
train eval:
Mean squared error: 1.3991228070175439
Correlation coefficient: 0.7902467723105108
Coefficient of determination (R-squared score, R2 score): 0.5863258795633908
---- ---- ----
class_num = 1056
Number of unique elements: 428
[   0.   66.  132.  198.  264.  330.  396.  462.  528.  594.  660.  726.
  792.  858.  924.  990. 1056.]
test eval:
Mean squared error: 3.3055555555555554
Correlation coefficient: 0.5066201824582414
Coefficient of determination (R-squared score, R2 score): 0.16160281814178779
train eval:
Mean squared error: 1.310672514619883
Correlation coefficient: 0.8034802783088595
Coefficient of determination (R-squared score, R2 score): 0.6124776917749005
---- ---- ----
class_num = 1072
Number of unique elements: 435
[   0.   67.  134.  201.  268.  335.  402.  469.  536.  603.  670.  737.
  804.  871.  938. 1005. 1072.]
test eval:
Mean squared error: 3.4722222222222223
Correlation coefficient: 0.497773439480096
Coefficient of determination (R-squared score, R2 score): 0.1193306913254073
train eval:
Mean squared error: 1.2214912280701755
Correlation coefficient: 0.8173079807102378
Coefficient of determination (R-squared score, R2 score): 0.6388456346658442
---- ---- ----
class_num = 1088
Number of unique elements: 437
[   0.   68.  136.  204.  272.  340.  408.  476.  544.  612.  680.  748.
  816.  884.  952. 1020. 1088.]
test eval:
Mean squared error: 3.6527777777777777
Correlation coefficient: 0.45987763081270244
Coefficient of determination (R-squared score, R2 score): 0.07353588727432847
train eval:
Mean squared error: 1.2982456140350878
Correlation coefficient: 0.8068687064143895
Coefficient of determination (R-squared score, R2 score): 0.616151913325278
---- ---- ----
class_num = 1104
Number of unique elements: 435
[   0.   69.  138.  207.  276.  345.  414.  483.  552.  621.  690.  759.
  828.  897.  966. 1035. 1104.]
test eval:
Mean squared error: 4.125
Correlation coefficient: 0.3830549835135723
Coefficient of determination (R-squared score, R2 score): -0.04623513870541607
train eval:
Mean squared error: 1.280701754385965
Correlation coefficient: 0.8082102913672906
Coefficient of determination (R-squared score, R2 score): 0.6213390496316931
---- ---- ----
class_num = 1120
Number of unique elements: 443
[   0.   70.  140.  210.  280.  350.  420.  490.  560.  630.  700.  770.
  840.  910.  980. 1050. 1120.]
test eval:
Mean squared error: 4.625
Correlation coefficient: 0.3520103257330956
Coefficient of determination (R-squared score, R2 score): -0.1730515191545574
train eval:
Mean squared error: 1.2163742690058479
Correlation coefficient: 0.8173583185109973
Coefficient of determination (R-squared score, R2 score): 0.640358549421882
---- ---- ----
class_num = 1136
Number of unique elements: 446
[   0.   71.  142.  213.  284.  355.  426.  497.  568.  639.  710.  781.
  852.  923.  994. 1065. 1136.]
test eval:
Mean squared error: 3.763888888888889
Correlation coefficient: 0.452082055121772
Coefficient of determination (R-squared score, R2 score): 0.04535446939674148
train eval:
Mean squared error: 1.236842105263158
Correlation coefficient: 0.8162370541258687
Coefficient of determination (R-squared score, R2 score): 0.6343068903977309
---- ---- ----
class_num = 1152
Number of unique elements: 453
[   0.   72.  144.  216.  288.  360.  432.  504.  576.  648.  720.  792.
  864.  936. 1008. 1080. 1152.]
test eval:
Mean squared error: 3.3055555555555554
Correlation coefficient: 0.5030456830970099
Coefficient of determination (R-squared score, R2 score): 0.16160281814178779
train eval:
Mean squared error: 1.253654970760234
Correlation coefficient: 0.8130056879325639
Coefficient of determination (R-squared score, R2 score): 0.6293358847707498
---- ---- ----
class_num = 1168
Number of unique elements: 463
[   0.   73.  146.  219.  292.  365.  438.  511.  584.  657.  730.  803.
  876.  949. 1022. 1095. 1168.]
test eval:
Mean squared error: 3.763888888888889
Correlation coefficient: 0.41847194394248616
Coefficient of determination (R-squared score, R2 score): 0.04535446939674148
train eval:
Mean squared error: 1.159356725146199
Correlation coefficient: 0.8288134090867733
Coefficient of determination (R-squared score, R2 score): 0.6572167424177313
---- ---- ----
class_num = 1184
Number of unique elements: 457
[   0.   74.  148.  222.  296.  370.  444.  518.  592.  666.  740.  814.
  888.  962. 1036. 1110. 1184.]
test eval:
Mean squared error: 3.5555555555555554
Correlation coefficient: 0.4540681400432646
Coefficient of determination (R-squared score, R2 score): 0.09819462791721711
train eval:
Mean squared error: 1.2660818713450293
Correlation coefficient: 0.811768015898878
Coefficient of determination (R-squared score, R2 score): 0.6256616632203724
---- ---- ----
class_num = 1200
Number of unique elements: 464
[   0.   75.  150.  225.  300.  375.  450.  525.  600.  675.  750.  825.
  900.  975. 1050. 1125. 1200.]
test eval:
Mean squared error: 3.8055555555555554
Correlation coefficient: 0.39485006534468375
Coefficient of determination (R-squared score, R2 score): 0.034786437692646444
train eval:
Mean squared error: 1.2002923976608186
Correlation coefficient: 0.8162717842466877
Coefficient of determination (R-squared score, R2 score): 0.6451134243694292
---- ---- ----
class_num = 1216
Number of unique elements: 476
[   0.   76.  152.  228.  304.  380.  456.  532.  608.  684.  760.  836.
  912.  988. 1064. 1140. 1216.]
test eval:
Mean squared error: 3.9166666666666665
Correlation coefficient: 0.347297234354348
Coefficient of determination (R-squared score, R2 score): 0.006605019815059454
train eval:
Mean squared error: 1.0891812865497077
Correlation coefficient: 0.8364000477174324
Coefficient of determination (R-squared score, R2 score): 0.6779652876433919
---- ---- ----
class_num = 1232
Number of unique elements: 480
[   0.   77.  154.  231.  308.  385.  462.  539.  616.  693.  770.  847.
  924. 1001. 1078. 1155. 1232.]
test eval:
Mean squared error: 3.4305555555555554
Correlation coefficient: 0.49179006421871096
Coefficient of determination (R-squared score, R2 score): 0.12989872302950245
train eval:
Mean squared error: 1.1900584795321638
Correlation coefficient: 0.8222615851130448
Coefficient of determination (R-squared score, R2 score): 0.6481392538815047
---- ---- ----
class_num = 1248
Number of unique elements: 474
[   0.   78.  156.  234.  312.  390.  468.  546.  624.  702.  780.  858.
  936. 1014. 1092. 1170. 1248.]
test eval:
Mean squared error: 3.1805555555555554
Correlation coefficient: 0.5440613383250746
Coefficient of determination (R-squared score, R2 score): 0.19330691325407312
train eval:
Mean squared error: 1.1396198830409356
Correlation coefficient: 0.8289047916554041
Coefficient of determination (R-squared score, R2 score): 0.6630522707624483
---- ---- ----
class_num = 1264
Number of unique elements: 487
[   0.   79.  158.  237.  316.  395.  474.  553.  632.  711.  790.  869.
  948. 1027. 1106. 1185. 1264.]
test eval:
Mean squared error: 3.611111111111111
Correlation coefficient: 0.44627046270937537
Coefficient of determination (R-squared score, R2 score): 0.08410391897842362
train eval:
Mean squared error: 1.0467836257309941
Correlation coefficient: 0.8426554017178337
Coefficient of determination (R-squared score, R2 score): 0.6905008670505619
---- ---- ----
class_num = 1280
Number of unique elements: 488
[   0.   80.  160.  240.  320.  400.  480.  560.  640.  720.  800.  880.
  960. 1040. 1120. 1200. 1280.]
test eval:
Mean squared error: 3.736111111111111
Correlation coefficient: 0.4095466365789959
Coefficient of determination (R-squared score, R2 score): 0.052399823866138284
train eval:
Mean squared error: 1.1396198830409356
Correlation coefficient: 0.8324861876534952
Coefficient of determination (R-squared score, R2 score): 0.6630522707624483
---- ---- ----
class_num = 1296
Number of unique elements: 492
[   0.   81.  162.  243.  324.  405.  486.  567.  648.  729.  810.  891.
  972. 1053. 1134. 1215. 1296.]
test eval:
Mean squared error: 3.7916666666666665
Correlation coefficient: 0.42597061673103304
Coefficient of determination (R-squared score, R2 score): 0.03830911492734479
train eval:
Mean squared error: 1.036549707602339
Correlation coefficient: 0.8427967258110125
Coefficient of determination (R-squared score, R2 score): 0.6935266965626374
---- ---- ----
class_num = 1312
Number of unique elements: 499
[   0.   82.  164.  246.  328.  410.  492.  574.  656.  738.  820.  902.
  984. 1066. 1148. 1230. 1312.]
test eval:
Mean squared error: 3.4722222222222223
Correlation coefficient: 0.46263440199102385
Coefficient of determination (R-squared score, R2 score): 0.1193306913254073
train eval:
Mean squared error: 1.0328947368421053
Correlation coefficient: 0.8440983238529769
Coefficient of determination (R-squared score, R2 score): 0.6946073499598073
---- ---- ----
class_num = 1328
Number of unique elements: 497
[   0.   83.  166.  249.  332.  415.  498.  581.  664.  747.  830.  913.
  996. 1079. 1162. 1245. 1328.]
test eval:
Mean squared error: 4.180555555555555
Correlation coefficient: 0.3639564680255573
Coefficient of determination (R-squared score, R2 score): -0.060325847644209674
train eval:
Mean squared error: 1.0928362573099415
Correlation coefficient: 0.8362356797569905
Coefficient of determination (R-squared score, R2 score): 0.6768846342462221
---- ---- ----
class_num = 1344
Number of unique elements: 501
[   0.   84.  168.  252.  336.  420.  504.  588.  672.  756.  840.  924.
 1008. 1092. 1176. 1260. 1344.]
test eval:
Mean squared error: 3.5555555555555554
Correlation coefficient: 0.4278885631510579
Coefficient of determination (R-squared score, R2 score): 0.09819462791721711
train eval:
Mean squared error: 1.1279239766081872
Correlation coefficient: 0.831651084013057
Coefficient of determination (R-squared score, R2 score): 0.6665103616333918
---- ---- ----
class_num = 1360
Number of unique elements: 503
[   0.   85.  170.  255.  340.  425.  510.  595.  680.  765.  850.  935.
 1020. 1105. 1190. 1275. 1360.]
test eval:
Mean squared error: 3.4444444444444446
Correlation coefficient: 0.4750872969502953
Coefficient of determination (R-squared score, R2 score): 0.1263760457948041
train eval:
Mean squared error: 1.1308479532163742
Correlation coefficient: 0.8315706101253245
Coefficient of determination (R-squared score, R2 score): 0.6656458389156559
---- ---- ----
class_num = 1376
Number of unique elements: 511
[   0.   86.  172.  258.  344.  430.  516.  602.  688.  774.  860.  946.
 1032. 1118. 1204. 1290. 1376.]
test eval:
Mean squared error: 3.986111111111111
Correlation coefficient: 0.344048389033826
Coefficient of determination (R-squared score, R2 score): -0.011008366358432387
train eval:
Mean squared error: 1.0328947368421053
Correlation coefficient: 0.844437469232195
Coefficient of determination (R-squared score, R2 score): 0.6946073499598073
---- ---- ----
class_num = 1392
Number of unique elements: 519
[   0.   87.  174.  261.  348.  435.  522.  609.  696.  783.  870.  957.
 1044. 1131. 1218. 1305. 1392.]
test eval:
Mean squared error: 4.027777777777778
Correlation coefficient: 0.3611800633782349
Coefficient of determination (R-squared score, R2 score): -0.021576398062527424
train eval:
Mean squared error: 1.0453216374269005
Correlation coefficient: 0.8432310597872611
Coefficient of determination (R-squared score, R2 score): 0.6909331284094298
---- ---- ----
class_num = 1408
Number of unique elements: 510
[   0.   88.  176.  264.  352.  440.  528.  616.  704.  792.  880.  968.
 1056. 1144. 1232. 1320. 1408.]
test eval:
Mean squared error: 4.388888888888889
Correlation coefficient: 0.3307964928843021
Coefficient of determination (R-squared score, R2 score): -0.11316600616468508
train eval:
Mean squared error: 1.144736842105263
Correlation coefficient: 0.828920419863793
Coefficient of determination (R-squared score, R2 score): 0.6615393560064107
---- ---- ----
class_num = 1424
Number of unique elements: 523
[   0.   89.  178.  267.  356.  445.  534.  623.  712.  801.  890.  979.
 1068. 1157. 1246. 1335. 1424.]
test eval:
Mean squared error: 3.8055555555555554
Correlation coefficient: 0.37666407530779566
Coefficient of determination (R-squared score, R2 score): 0.034786437692646444
train eval:
Mean squared error: 1.1038011695906433
Correlation coefficient: 0.833746816153345
Coefficient of determination (R-squared score, R2 score): 0.6736426740547126
---- ---- ----
class_num = 1440
Number of unique elements: 519
[   0.   90.  180.  270.  360.  450.  540.  630.  720.  810.  900.  990.
 1080. 1170. 1260. 1350. 1440.]
test eval:
Mean squared error: 3.875
Correlation coefficient: 0.39339416969687585
Coefficient of determination (R-squared score, R2 score): 0.017173051519154603
train eval:
Mean squared error: 1.162280701754386
Correlation coefficient: 0.8264827454021899
Coefficient of determination (R-squared score, R2 score): 0.6563522196999954
---- ---- ----
class_num = 1456
Number of unique elements: 530
[   0.   91.  182.  273.  364.  455.  546.  637.  728.  819.  910. 1001.
 1092. 1183. 1274. 1365. 1456.]
test eval:
Mean squared error: 3.4305555555555554
Correlation coefficient: 0.46730321353244414
Coefficient of determination (R-squared score, R2 score): 0.12989872302950245
train eval:
Mean squared error: 1.118421052631579
Correlation coefficient: 0.8330034585932155
Coefficient of determination (R-squared score, R2 score): 0.6693200604660333
---- ---- ----
class_num = 1472
Number of unique elements: 525
[   0.   92.  184.  276.  368.  460.  552.  644.  736.  828.  920. 1012.
 1104. 1196. 1288. 1380. 1472.]
test eval:
Mean squared error: 4.402777777777778
Correlation coefficient: 0.33778786844126274
Coefficient of determination (R-squared score, R2 score): -0.11668868339938343
train eval:
Mean squared error: 1.1206140350877194
Correlation coefficient: 0.8328181060974977
Coefficient of determination (R-squared score, R2 score): 0.6686716684277314
---- ---- ----
class_num = 1488
Number of unique elements: 532
[   0.   93.  186.  279.  372.  465.  558.  651.  744.  837.  930. 1023.
 1116. 1209. 1302. 1395. 1488.]
test eval:
Mean squared error: 3.9027777777777777
Correlation coefficient: 0.3973386098897478
Coefficient of determination (R-squared score, R2 score): 0.0101276970497578
train eval:
Mean squared error: 0.9868421052631579
Correlation coefficient: 0.8513763519013561
Coefficient of determination (R-squared score, R2 score): 0.708223582764147
---- ---- ----
class_num = 1504
Number of unique elements: 534
[   0.   94.  188.  282.  376.  470.  564.  658.  752.  846.  940. 1034.
 1128. 1222. 1316. 1410. 1504.]
test eval:
Mean squared error: 3.7777777777777777
Correlation coefficient: 0.3976083689307804
Coefficient of determination (R-squared score, R2 score): 0.041831792162043135
train eval:
Mean squared error: 1.0687134502923976
Correlation coefficient: 0.8414098886090738
Coefficient of determination (R-squared score, R2 score): 0.684016946667543
---- ---- ----
class_num = 1520
Number of unique elements: 538
[   0.   95.  190.  285.  380.  475.  570.  665.  760.  855.  950. 1045.
 1140. 1235. 1330. 1425. 1520.]
test eval:
Mean squared error: 3.736111111111111
Correlation coefficient: 0.4452256456517434
Coefficient of determination (R-squared score, R2 score): 0.052399823866138284
train eval:
Mean squared error: 0.9963450292397661
Correlation coefficient: 0.8493180191035392
Coefficient of determination (R-squared score, R2 score): 0.7054138839315055
---- ---- ----
class_num = 1536
Number of unique elements: 543
[   0.   96.  192.  288.  384.  480.  576.  672.  768.  864.  960. 1056.
 1152. 1248. 1344. 1440. 1536.]
test eval:
Mean squared error: 4.361111111111111
Correlation coefficient: 0.30649124324470645
Coefficient of determination (R-squared score, R2 score): -0.10612065169528839
train eval:
Mean squared error: 0.9254385964912281
Correlation coefficient: 0.8604761215901255
Coefficient of determination (R-squared score, R2 score): 0.7263785598366002
---- ---- ----
class_num = 1552
Number of unique elements: 552
[   0.   97.  194.  291.  388.  485.  582.  679.  776.  873.  970. 1067.
 1164. 1261. 1358. 1455. 1552.]
test eval:
Mean squared error: 4.916666666666667
Correlation coefficient: 0.3376522863225631
Coefficient of determination (R-squared score, R2 score): -0.24702774108322334
train eval:
Mean squared error: 1.1798245614035088
Correlation coefficient: 0.8275789495015552
Coefficient of determination (R-squared score, R2 score): 0.6511650833935803
---- ---- ----
class_num = 1568
Number of unique elements: 546
[   0.   98.  196.  294.  392.  490.  588.  686.  784.  882.  980. 1078.
 1176. 1274. 1372. 1470. 1568.]
test eval:
Mean squared error: 3.8333333333333335
Correlation coefficient: 0.36276795113160476
Coefficient of determination (R-squared score, R2 score): 0.02774108322324964
train eval:
Mean squared error: 1.1308479532163742
Correlation coefficient: 0.8311792384892888
Coefficient of determination (R-squared score, R2 score): 0.6656458389156559
---- ---- ----
class_num = 1584
Number of unique elements: 553
[   0.   99.  198.  297.  396.  495.  594.  693.  792.  891.  990. 1089.
 1188. 1287. 1386. 1485. 1584.]
test eval:
Mean squared error: 3.75
Correlation coefficient: 0.4037425622438564
Coefficient of determination (R-squared score, R2 score): 0.04887714663143994
train eval:
Mean squared error: 1.0504385964912282
Correlation coefficient: 0.8422369224415918
Coefficient of determination (R-squared score, R2 score): 0.6894202136533921
---- ---- ----
class_num = 1600
Number of unique elements: 554
[   0.  100.  200.  300.  400.  500.  600.  700.  800.  900. 1000. 1100.
 1200. 1300. 1400. 1500. 1600.]
test eval:
Mean squared error: 3.8055555555555554
Correlation coefficient: 0.3784207614719348
Coefficient of determination (R-squared score, R2 score): 0.034786437692646444
train eval:
Mean squared error: 1.0789473684210527
Correlation coefficient: 0.8385805409196683
Coefficient of determination (R-squared score, R2 score): 0.6809911171554675
In [ ]:
# plot the trend figures of mse, correlation, and r2

# Set the global font size BEFORE creating the figure: rcParams changes are
# consulted when text artists are created, so updating it after plotting (as
# this cell originally did) has no effect on this figure's titles/labels.
plt.rcParams.update({'font.size': 12})

# Create a figure with three stacked panels: MSE, correlation, R-squared.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 12))

# Plot train/test trends for each metric against the original class number.
# NOTE(review): assumes class_num_array, the *_test_list / *_train_list
# metrics, and reduced_class_num were produced by the sweep cell above.
panel_specs = (
    (ax1, mse_test_list, mse_train_list, 'MSE'),
    (ax2, correlation_test_list, correlation_train_list, 'Correlation'),
    (ax3, r_squared_test_list, r_squared_train_list, 'R-squared'),
)
for ax, test_vals, train_vals, metric in panel_specs:
    ax.plot(class_num_array, test_vals, label=f'{metric} (Test)')
    ax.plot(class_num_array, train_vals, label=f'{metric} (Train)')
    ax.set_ylabel(metric)
    ax.set_xlabel('Original Class Number')
    ax.set_title(f'{metric} Curve (reduced class num is {reduced_class_num})')
    ax.legend()
    ax.tick_params(labelsize=10)

# Increase the vertical spacing between subplots
plt.subplots_adjust(hspace=0.5)

# Save the figure (must happen before plt.show(), which clears the figure)
plt.savefig(f'mse_correlation_r2_trend_curve_reduced_eval_reduced_class_num_{reduced_class_num}.png', bbox_inches='tight')

# Show the figure
plt.show()

# Locate the original class number that maximizes the test R-squared score.
r_squared_test_array = np.array(r_squared_test_list)
max_index = np.argmax(r_squared_test_array)
max_class_num = class_num_array[max_index]

# Print the index and corresponding class_num
print("Max Index:", max_index)
print("Max Original Class Num:", max_class_num)
Max Index: 29
Max Original Class Num: 480
In [ ]:
# batch download the plotted figures
# uncomment the code below to download figures if needed

# NOTE: the block below is intentionally disabled by wrapping it in a bare
# string literal. When enabled it globs every saved comparison figure that
# matches `file_prefix`, zips them into files.zip, and triggers a browser
# download via google.colab.files — so it only works inside Google Colab.
'''
import glob

folder_path = '.'
file_prefix = 'Comparison of Sorted Predictions and Sorted Ground Truth (Logistic Linear Regression Reduced Evaluation'

# Use glob to find all files with the given prefix in the folder
matching_files = glob.glob(f"{folder_path}/{file_prefix}*")
# print(matching_files)
# # Print the matching file names
# for file_path in matching_files:
#     print(file_path)

import zipfile

zip_filename = 'files.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Add files to the zip file
    for file_path in matching_files:
        zipf.write(file_path)

from google.colab import files
files.download(zip_filename)
'''

Fit and predict (balanced weights)¶

Balanced class weights do not improve the fit here — they make it worse (compare the accuracy and correlation below with the unweighted model above).

In [ ]:
# independent data: per-group stacked green-channel responses (defined in an earlier cell)
x = group_satcked_green

# number of equal-width classes the continuous target is discretized into
class_num = 48

# dependent data (labels/targets): somatic red-channel response
y = np.squeeze(stacked_red)
# print(np.max(y), np.min(y))

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

# Bin the continuous target into class_num equal-width classes.
# BUGFIX: np.finfo(float).eps is an *absolute* offset (~2.2e-16), so for
# |y| >= 2 the old `np.max(y) + eps` rounds back to np.max(y); np.digitize
# would then map y == max(y) past the last bin, yielding the out-of-range
# label `class_num`. np.nextafter widens each bound by one ULP of the actual
# value, which works at any magnitude.
infinitesimal = np.finfo(float).eps  # kept: later cells may reference this name
min_val = np.nextafter(np.min(y), -np.inf)  # guarantee min(y) falls inside the first bin
max_val = np.nextafter(np.max(y), np.inf)   # guarantee max(y) falls inside the last bin
# Generate class_num+1 evenly spaced bin edges
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
# print(intervals)
# Digitize returns 1..class_num for in-range values, so subtract 1 to get
# 0-based labels; clip as a safety net so labels always lie in [0, class_num-1].
# NOTE(review): bin edges are computed from the full y (train + test) — mild
# test-set leakage; presumably intentional so both splits share one label map.
y_train = np.clip(np.digitize(y_train, intervals) - 1, 0, class_num - 1)
y_test = np.clip(np.digitize(y_test, intervals) - 1, 0, class_num - 1)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
unique_elements = np.unique(y_train)
print("Unique elements:", unique_elements)
print("Number of unique elements:", len(unique_elements))
x_train shape: (1368, 25)
y_train shape: (1368,)
x_test shape: (72, 25)
y_test shape: (72,)
Unique elements: [ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 38 39 42 47]
Number of unique elements: 40
In [ ]:
# fit
# Multinomial logistic regression; class_weight='balanced' re-weights each
# class inversely proportional to its frequency in y_train.
# NOTE(review): multi_class='multinomial' is deprecated in scikit-learn >= 1.5
# (it became the default) — confirm the installed version before removing it.
model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, class_weight='balanced', multi_class='multinomial')
fit_result = model.fit(x_train, y_train)
# one intercept and one coefficient row per class actually present in y_train
print(fit_result.intercept_.shape, fit_result.coef_.shape)

# predict
# Use the trained model to make predictions
y_pred = model.predict(x_test)
# Alternatively, you can get the predicted probabilities for each class
y_prob = model.predict_proba(x_test)
# Print the predicted class labels
print(y_pred, y_test)
print(y_pred.shape, y_test.shape)
# Print the predicted probabilities
# print(y_prob)
(40,) (40, 25)
[ 5 27 22 22  9 29 11 25 22 29  5 23  5 10  6  8  8 29 22 10 26 36 11 26
 12 16 22 26 31 34 10  9 29  7 24  6 32 13 20 10  9 34  8 21 14  7 29 28
 19 11  9 20  8 14 10 23 20  3 28  3 12 10 14 11 19  3 13 20 20 12 20 24] [19 13 25 35 13 21 10 18 17 28 15 18  7 16 10  8 10 12 25 29 14 21 12 16
 11 16 13 14  9 18 15 13 18 12 11 10 30 16 14 12 12 12 30 12 12 13 17 28
 11 17  9 15 11 15 16 22 18 10 17  7 18 11 12  9 15 13  8 22 17 11 16 16]
(72,) (72,)

Evaluate (balanced weights)¶

In [ ]:
# Evaluate the balanced-weights model on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# sklearn convention: rows = true labels, columns = predicted labels, i.e.
# cm[i, j] = number of samples whose true class is i and predicted class is j.
# Diagonal entries (cm[i, i]) are correct predictions for class i.
# Off-diagonal entries in row i are false negatives for class i
# (true i, predicted something else); off-diagonal entries in column j are
# false positives for class j (predicted j, true something else).

# Calculate the correlation coefficient between predicted and true class
# indices — meaningful here because the classes are ordered bins of a
# continuous target, so nearby classes are genuinely "closer".
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)


plot_comparison(y_test, y_pred, 'Logistic Linear Regression balanced weights, Test Set')
Accuracy: 0.05555555555555555
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Correlation coefficient: 0.4046314021861102
In [ ]:
# predict on train
# Same evaluation as the previous cell, but on the training split, to gauge
# how much the model over- or under-fits.
# Use the trained model to make predictions
y_pred_ = model.predict(x_train)
# Alternatively, you can get the predicted probabilities for each class
y_prob_ = model.predict_proba(x_train)

accuracy = accuracy_score(y_train, y_pred_)
print("Accuracy:", accuracy)

cm = confusion_matrix(y_train, y_pred_)
print("Confusion Matrix:")
print(cm)
# sklearn convention: rows = true labels, columns = predicted labels, i.e.
# cm[i, j] = number of samples whose true class is i and predicted class is j.
# Diagonal entries (cm[i, i]) are correct predictions for class i.
# Off-diagonal entries in row i are false negatives for class i
# (true i, predicted something else); off-diagonal entries in column j are
# false positives for class j (predicted j, true something else).

# Calculate the correlation coefficient between predicted and true class
# indices (classes are ordered bins, so this is a sensible summary)
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)


plot_comparison(y_train, y_pred_, 'Logistic Linear Regression balanced weights, Train Set')
Accuracy: 0.15862573099415206
Confusion Matrix:
[[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 2]]
Correlation coefficient: 0.5813149974591648